You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by yi...@apache.org on 2023/01/13 01:43:04 UTC
[doris] branch master updated: [refactor](remove unused code) remove buffer pool and disk io mgr (#15853)
This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 16862d9b43 [refactor](remove unused code) remove buffer pool and disk io mgr (#15853)
16862d9b43 is described below
commit 16862d9b4389b7061f57cb0df702ba4a5456ecec
Author: yiguolei <67...@qq.com>
AuthorDate: Fri Jan 13 09:42:58 2023 +0800
[refactor](remove unused code) remove buffer pool and disk io mgr (#15853)
* [refactor](remove buffer pool and disk io mgr) remove unused code
Co-authored-by: yiguolei <yi...@gmail.com>
---
be/src/common/config.h | 10 +-
be/src/common/daemon.cpp | 22 -
be/src/common/daemon.h | 3 -
be/src/exec/exec_node.cpp | 34 -
be/src/exec/exec_node.h | 16 -
be/src/runtime/CMakeLists.txt | 7 -
be/src/runtime/bufferpool/buffer_allocator.cc | 744 --------------
be/src/runtime/bufferpool/buffer_allocator.h | 241 -----
be/src/runtime/bufferpool/buffer_pool.cc | 667 ------------
be/src/runtime/bufferpool/buffer_pool.h | 466 ---------
be/src/runtime/bufferpool/buffer_pool_counters.h | 43 -
be/src/runtime/bufferpool/buffer_pool_internal.h | 299 ------
be/src/runtime/bufferpool/free_list.h | 115 ---
be/src/runtime/bufferpool/suballocator.cc | 252 -----
be/src/runtime/bufferpool/suballocator.h | 221 ----
be/src/runtime/bufferpool/system_allocator.cc | 166 ---
be/src/runtime/bufferpool/system_allocator.h | 49 -
be/src/runtime/disk_io_mgr.cc | 1195 ----------------------
be/src/runtime/disk_io_mgr.h | 837 ---------------
be/src/runtime/disk_io_mgr_internal.h | 455 --------
be/src/runtime/disk_io_mgr_reader_context.cc | 322 ------
be/src/runtime/disk_io_mgr_scan_range.cc | 481 ---------
be/src/runtime/exec_env.h | 10 -
be/src/runtime/exec_env_init.cpp | 48 -
be/src/runtime/runtime_state.h | 1 -
be/src/util/filesystem_util.h | 4 +-
be/test/CMakeLists.txt | 1 -
be/test/runtime/disk_io_mgr_test.cpp | 1069 -------------------
be/test/runtime/test_env.cc | 9 -
be/test/runtime/test_env.h | 3 -
30 files changed, 3 insertions(+), 7787 deletions(-)
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 394ffd71c4..021410f26b 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -424,7 +424,7 @@ CONF_Int32(min_buffer_size, "1024"); // 1024, The minimum read buffer size (in b
CONF_Int32(max_free_io_buffers, "128");
// Whether to disable the memory cache pool,
-// including MemPool, ChunkAllocator, BufferPool, DiskIO free buffer.
+// including MemPool, ChunkAllocator, DiskIO free buffer.
CONF_Bool(disable_mem_pools, "false");
// The reserved bytes limit of Chunk Allocator, usually set as a percentage of mem_limit.
@@ -473,14 +473,6 @@ CONF_Bool(madvise_huge_pages, "false");
// whether use mmap to allocate memory
CONF_Bool(mmap_buffers, "false");
-// max memory can be allocated by buffer pool
-// This is the percentage of mem_limit
-CONF_String(buffer_pool_limit, "20%");
-
-// clean page can be hold by buffer pool
-// This is the percentage of buffer_pool_limit
-CONF_String(buffer_pool_clean_pages_limit, "50%");
-
// Sleep time in milliseconds between memory maintenance iterations
CONF_mInt64(memory_maintenance_sleep_time_ms, "500");
diff --git a/be/src/common/daemon.cpp b/be/src/common/daemon.cpp
index 296df8afbd..17e8046f0b 100644
--- a/be/src/common/daemon.cpp
+++ b/be/src/common/daemon.cpp
@@ -50,7 +50,6 @@
#include "geo/geo_functions.h"
#include "olap/options.h"
#include "runtime/block_spill_manager.h"
-#include "runtime/bufferpool/buffer_pool.h"
#include "runtime/exec_env.h"
#include "runtime/fragment_mgr.h"
#include "runtime/load_channel_mgr.h"
@@ -188,20 +187,6 @@ void Daemon::tcmalloc_gc_thread() {
#endif
}
-void Daemon::buffer_pool_gc_thread() {
- while (!_stop_background_threads_latch.wait_for(std::chrono::seconds(10))) {
- ExecEnv* env = ExecEnv::GetInstance();
- // ExecEnv may not have been created yet or this may be the catalogd or statestored,
- // which don't have ExecEnvs.
- if (env != nullptr) {
- BufferPool* buffer_pool = env->buffer_pool();
- if (buffer_pool != nullptr) {
- buffer_pool->Maintenance();
- }
- }
- }
-}
-
void Daemon::memory_maintenance_thread() {
int64_t interval_milliseconds = config::memory_maintenance_sleep_time_ms;
while (!_stop_background_threads_latch.wait_for(
@@ -429,10 +414,6 @@ void Daemon::start() {
"Daemon", "tcmalloc_gc_thread", [this]() { this->tcmalloc_gc_thread(); },
&_tcmalloc_gc_thread);
CHECK(st.ok()) << st;
- st = Thread::create(
- "Daemon", "buffer_pool_gc_thread", [this]() { this->buffer_pool_gc_thread(); },
- &_buffer_pool_gc_thread);
- CHECK(st.ok()) << st;
st = Thread::create(
"Daemon", "memory_maintenance_thread", [this]() { this->memory_maintenance_thread(); },
&_memory_maintenance_thread);
@@ -467,9 +448,6 @@ void Daemon::stop() {
if (_tcmalloc_gc_thread) {
_tcmalloc_gc_thread->join();
}
- if (_buffer_pool_gc_thread) {
- _buffer_pool_gc_thread->join();
- }
if (_memory_maintenance_thread) {
_memory_maintenance_thread->join();
}
diff --git a/be/src/common/daemon.h b/be/src/common/daemon.h
index 39dbd4235f..525c1e1aa0 100644
--- a/be/src/common/daemon.h
+++ b/be/src/common/daemon.h
@@ -46,7 +46,6 @@ public:
private:
void tcmalloc_gc_thread();
- void buffer_pool_gc_thread();
void memory_maintenance_thread();
void load_channel_tracker_refresh_thread();
void calculate_metrics_thread();
@@ -54,8 +53,6 @@ private:
CountDownLatch _stop_background_threads_latch;
scoped_refptr<Thread> _tcmalloc_gc_thread;
- // only buffer pool gc, will be removed after.
- scoped_refptr<Thread> _buffer_pool_gc_thread;
scoped_refptr<Thread> _memory_maintenance_thread;
scoped_refptr<Thread> _load_channel_tracker_refresh_thread;
scoped_refptr<Thread> _calculate_metrics_thread;
diff --git a/be/src/exec/exec_node.cpp b/be/src/exec/exec_node.cpp
index 3a2d713403..9bd0c5c494 100644
--- a/be/src/exec/exec_node.cpp
+++ b/be/src/exec/exec_node.cpp
@@ -224,10 +224,6 @@ void ExecNode::release_resource(doris::RuntimeState* state) {
}
vectorized::VExpr::close(_projections, state);
- if (_buffer_pool_client.is_registered()) {
- state->exec_env()->buffer_pool()->DeregisterClient(&_buffer_pool_client);
- }
-
runtime_profile()->add_to_span();
_is_resource_released = true;
}
@@ -597,36 +593,6 @@ void ExecNode::init_runtime_profile(const std::string& name) {
_runtime_profile->set_metadata(_id);
}
-Status ExecNode::claim_buffer_reservation(RuntimeState* state) {
- DCHECK(!_buffer_pool_client.is_registered());
- BufferPool* buffer_pool = ExecEnv::GetInstance()->buffer_pool();
- // Check the minimum buffer size in case the minimum buffer size used by the planner
- // doesn't match this backend's.
- std::stringstream ss;
- if (_resource_profile.__isset.spillable_buffer_size &&
- _resource_profile.spillable_buffer_size < buffer_pool->min_buffer_len()) {
- ss << "Spillable buffer size for node " << _id << " of "
- << _resource_profile.spillable_buffer_size
- << "bytes is less than the minimum buffer pool buffer size of "
- << buffer_pool->min_buffer_len() << "bytes";
- return Status::InternalError(ss.str());
- }
-
- ss << print_plan_node_type(_type) << " id=" << _id << " ptr=" << this;
- RETURN_IF_ERROR(buffer_pool->RegisterClient(ss.str(), runtime_profile(), &_buffer_pool_client));
-
- /*
- if (debug_action_ == TDebugAction::SET_DENY_RESERVATION_PROBABILITY &&
- (debug_phase_ == TExecNodePhase::PREPARE || debug_phase_ == TExecNodePhase::OPEN)) {
- // We may not have been able to enable the debug action at the start of Prepare() or
- // Open() because the client is not registered then. Do it now to be sure that it is
- // effective.
- RETURN_IF_ERROR(EnableDenyReservationDebugAction());
- }
-*/
- return Status::OK();
-}
-
void ExecNode::release_block_memory(vectorized::Block& block, uint16_t child_idx) {
DCHECK(child_idx < _children.size());
block.clear_column_data(child(child_idx)->row_desc().num_materialized_slots());
diff --git a/be/src/exec/exec_node.h b/be/src/exec/exec_node.h
index ff95d96934..353db077f3 100644
--- a/be/src/exec/exec_node.h
+++ b/be/src/exec/exec_node.h
@@ -26,7 +26,6 @@
#include "common/status.h"
#include "gen_cpp/PlanNodes_types.h"
-#include "runtime/bufferpool/buffer_pool.h"
#include "runtime/descriptors.h"
#include "runtime/query_statistics.h"
#include "service/backend_options.h"
@@ -248,15 +247,6 @@ public:
protected:
friend class DataSink;
- /// Initialize 'buffer_pool_client_' and claim the initial reservation for this
- /// ExecNode. Only needs to be called by ExecNodes that will use the client.
- /// The client is automatically cleaned up in Close(). Should not be called if
- /// the client is already open.
- /// The ExecNode must return the initial reservation to
- /// QueryState::initial_reservations(), which is done automatically in Close() as long
- /// as the initial reservation is not released before Close().
- Status claim_buffer_reservation(RuntimeState* state);
-
/// Release all memory of block which got from child. The block
// 1. clear mem of valid column get from child, make sure child can reuse the mem
// 2. delete and release the column which create by function all and other reason
@@ -315,12 +305,6 @@ protected:
std::mutex _exec_options_lock;
std::string _runtime_exec_options;
- /// Buffer pool client for this node. Initialized with the node's minimum reservation
- /// in ClaimBufferReservation(). After initialization, the client must hold onto at
- /// least the minimum reservation so that it can be returned to the initial
- /// reservations pool in Close().
- BufferPool::ClientHandle _buffer_pool_client;
-
// Set to true if this is a vectorized exec node.
bool _is_vec = false;
diff --git a/be/src/runtime/CMakeLists.txt b/be/src/runtime/CMakeLists.txt
index b2930ffb8a..dfc375bd24 100644
--- a/be/src/runtime/CMakeLists.txt
+++ b/be/src/runtime/CMakeLists.txt
@@ -56,16 +56,9 @@ set(RUNTIME_FILES
load_path_mgr.cpp
types.cpp
tmp_file_mgr.cc
- disk_io_mgr.cc
- disk_io_mgr_reader_context.cc
- disk_io_mgr_scan_range.cc
load_channel_mgr.cpp
load_channel.cpp
tablets_channel.cpp
- bufferpool/buffer_allocator.cc
- bufferpool/buffer_pool.cc
- bufferpool/suballocator.cc
- bufferpool/system_allocator.cc
snapshot_loader.cpp
query_statistics.cpp
message_body_sink.cpp
diff --git a/be/src/runtime/bufferpool/buffer_allocator.cc b/be/src/runtime/bufferpool/buffer_allocator.cc
deleted file mode 100644
index 1b2ef5c95a..0000000000
--- a/be/src/runtime/bufferpool/buffer_allocator.cc
+++ /dev/null
@@ -1,744 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "runtime/bufferpool/buffer_allocator.h"
-
-#include <mutex>
-
-#include "common/config.h"
-#include "runtime/bufferpool/system_allocator.h"
-#include "runtime/thread_context.h"
-#include "util/bit_util.h"
-#include "util/cpu_info.h"
-#include "util/pretty_printer.h"
-#include "util/runtime_profile.h"
-
-//DECLARE_bool(disable_mem_pools);
-
-namespace doris {
-
-/// Decrease 'bytes_remaining' by up to 'max_decrease', down to a minimum of 0.
-/// If 'require_full_decrease' is true, only decrease if we can decrease it
-/// 'max_decrease'. Returns the amount it was decreased by.
-static int64_t DecreaseBytesRemaining(int64_t max_decrease, bool require_full_decrease,
- std::atomic<int64_t>* bytes_remaining);
-
-/// An arena containing free buffers and clean pages that are associated with a
-/// particular core. All public methods are thread-safe.
-class BufferPool::FreeBufferArena : public CacheLineAligned {
-public:
- FreeBufferArena(BufferAllocator* parent);
-
- // Destructor should only run in backend tests.
- ~FreeBufferArena();
-
- /// Add a free buffer to the free lists. May free buffers to the system allocator
- /// if the list becomes full. Caller should not hold 'lock_'
- bool AddFreeBuffer(BufferHandle&& buffer);
-
- /// Try to get a free buffer of 'buffer_len' bytes from this arena. Returns true and
- /// sets 'buffer' if found or false if not found. Caller should not hold 'lock_'.
- bool PopFreeBuffer(int64_t buffer_len, BufferHandle* buffer);
-
- /*
- /// Try to get a buffer of 'buffer_len' bytes from this arena by evicting a clean page.
- /// Returns true and sets 'buffer' if a clean page was evicted or false otherwise.
- /// Caller should not hold 'lock_'
- bool EvictCleanPage(int64_t buffer_len, BufferHandle* buffer);
-*/
- /// Try to free 'target_bytes' of memory from this arena back to the system allocator.
- /// Up to 'target_bytes_to_claim' will be given back to the caller, so it can allocate
- /// a buffer of that size from the system. Any bytes freed in excess of
- /// 'target_bytes_to_claim' are added to 'system_bytes_remaining_'. Returns the actual
- /// number of bytes freed and the actual number of bytes claimed.
- ///
- /// Caller should not hold 'lock_'. If 'arena_lock' is non-null, ownership of the
- /// arena lock is transferred to the caller. Uses std::unique_lock instead of
- /// boost::std::unique_lock because it is movable.
- std::pair<int64_t, int64_t> FreeSystemMemory(int64_t target_bytes_to_free,
- int64_t target_bytes_to_claim,
- std::unique_lock<SpinLock>* arena_lock);
-
- /// Add a clean page to the arena. Caller must hold the page's client's lock and not
- /// hold 'lock_' or any Page::lock_.
- void AddCleanPage(Page* page);
-
- /// Removes the clean page from the arena if present. Returns true if removed. If
- /// 'claim_buffer' is true, the buffer is returned with the page, otherwise it is
- /// added to the free buffer list. Caller must hold the page's client's lock and
- /// not hold 'lock_' or any Page::lock_.
- bool RemoveCleanPage(bool claim_buffer, Page* page);
-
- /// Called periodically. Shrinks free lists that are holding onto more memory than
- /// needed.
- void Maintenance();
-
- /// Test helper: gets the current size of the free list for buffers of 'len' bytes
- /// on core 'core'.
- int GetFreeListSize(int64_t len);
-
- /// Return the total number of free buffers in the arena. May be approximate since
- /// it doesn't acquire the arena lock.
- int64_t GetNumFreeBuffers();
-
- /// Return the total bytes of free buffers in the arena. May be approximate since
- /// it doesn't acquire the arena lock.
- int64_t GetFreeBufferBytes();
-
- /// Return the total number of clean pages in the arena. May be approximate since
- /// it doesn't acquire the arena lock.
- int64_t GetNumCleanPages();
-
- std::string DebugString();
-
-private:
- /// The data structures for each power-of-two size of buffers/pages.
- /// All members are protected by FreeBufferArena::lock_ unless otherwise mentioned.
- struct PerSizeLists {
- PerSizeLists() : num_free_buffers(0), low_water_mark(0), num_clean_pages(0) {}
-
- /// Helper to add a free buffer and increment the counter.
- /// FreeBufferArena::lock_ must be held by the caller.
- void AddFreeBuffer(BufferHandle&& buffer) {
- DCHECK_EQ(num_free_buffers.load(std::memory_order_acquire), free_buffers.Size());
- num_free_buffers.fetch_add(1, std::memory_order_release);
- free_buffers.AddFreeBuffer(std::move(buffer));
- }
-
- /// The number of entries in 'free_buffers'. Can be read without holding a lock to
- /// allow threads to quickly skip over empty lists when trying to find a buffer.
- std::atomic<int64_t> num_free_buffers;
-
- /// Buffers that are not in use that were originally allocated on the core
- /// corresponding to this arena.
- FreeList free_buffers;
-
- /// The minimum size of 'free_buffers' since the last Maintenance() call.
- int low_water_mark;
-
- /// The number of entries in 'clean_pages'.
- /// Can be read without holding a lock to allow threads to quickly skip over empty
- /// lists when trying to find a buffer in a different arena.
- std::atomic<int64_t> num_clean_pages;
-
- /// Unpinned pages that have had their contents written to disk. These pages can be
- /// evicted to reclaim a buffer for any client. Pages are evicted in FIFO order,
- /// so that pages are evicted in approximately the same order that the clients wrote
- /// them to disk. Protected by FreeBufferArena::lock_.
- InternalList<Page> clean_pages;
- };
-
- /// Return the number of buffer sizes for this allocator.
- int NumBufferSizes() const {
- return parent_->log_max_buffer_len_ - parent_->log_min_buffer_len_ + 1;
- }
-
- /// Return the lists of buffers for buffers of the given length.
- PerSizeLists* GetListsForSize(int64_t buffer_len) {
- DCHECK(BitUtil::IsPowerOf2(buffer_len));
- int idx = BitUtil::Log2Ceiling64(buffer_len) - parent_->log_min_buffer_len_;
- DCHECK_LT(idx, NumBufferSizes());
- return &buffer_sizes_[idx];
- }
-
- /// Compute a sum over all the lists in the arena. Does not lock the arena.
- int64_t SumOverSizes(
- std::function<int64_t(PerSizeLists* lists, int64_t buffer_size)> compute_fn);
-
- BufferAllocator* const parent_;
-
- /// Protects all data structures in the arena. See buffer-pool-internal.h for lock
- /// order.
- SpinLock lock_;
-
- /// Free buffers and clean pages for each buffer size for this arena.
- /// Indexed by log2(bytes) - log2(min_buffer_len_).
- PerSizeLists buffer_sizes_[LOG_MAX_BUFFER_BYTES + 1];
-};
-
-int64_t BufferPool::BufferAllocator::CalcMaxBufferLen(int64_t min_buffer_len,
- int64_t system_bytes_limit) {
- // Find largest power of 2 smaller than 'system_bytes_limit'.
- int64_t upper_bound =
- system_bytes_limit == 0 ? 1L : 1L << BitUtil::Log2Floor64(system_bytes_limit);
- upper_bound = std::min(MAX_BUFFER_BYTES, upper_bound);
- return std::max(min_buffer_len, upper_bound); // Can't be < min_buffer_len.
-}
-
-BufferPool::BufferAllocator::BufferAllocator(BufferPool* pool, int64_t min_buffer_len,
- int64_t system_bytes_limit,
- int64_t clean_page_bytes_limit)
- : pool_(pool),
- system_allocator_(new SystemAllocator(min_buffer_len)),
- min_buffer_len_(min_buffer_len),
- max_buffer_len_(CalcMaxBufferLen(min_buffer_len, system_bytes_limit)),
- log_min_buffer_len_(BitUtil::Log2Ceiling64(min_buffer_len_)),
- log_max_buffer_len_(BitUtil::Log2Ceiling64(max_buffer_len_)),
- system_bytes_limit_(system_bytes_limit),
- system_bytes_remaining_(system_bytes_limit),
- clean_page_bytes_limit_(clean_page_bytes_limit),
- clean_page_bytes_remaining_(clean_page_bytes_limit),
- per_core_arenas_(CpuInfo::get_max_num_cores()),
- max_scavenge_attempts_(MAX_SCAVENGE_ATTEMPTS),
- _mem_tracker(std::make_unique<MemTracker>("BufferAllocator")) {
- DCHECK(BitUtil::IsPowerOf2(min_buffer_len_)) << min_buffer_len_;
- DCHECK(BitUtil::IsPowerOf2(max_buffer_len_)) << max_buffer_len_;
- DCHECK_LE(0, min_buffer_len_);
- DCHECK_LE(min_buffer_len_, max_buffer_len_);
- DCHECK_LE(max_buffer_len_, MAX_BUFFER_BYTES);
- DCHECK_LE(max_buffer_len_, std::max(system_bytes_limit_, min_buffer_len_));
-
- for (std::unique_ptr<FreeBufferArena>& arena : per_core_arenas_) {
- arena.reset(new FreeBufferArena(this));
- }
-}
-
-BufferPool::BufferAllocator::~BufferAllocator() {
- per_core_arenas_.clear(); // Release all the memory.
- // Check for accounting leaks.
- DCHECK_EQ(system_bytes_limit_, system_bytes_remaining_.load(std::memory_order_acquire));
- DCHECK_EQ(clean_page_bytes_limit_, clean_page_bytes_remaining_.load(std::memory_order_acquire));
-}
-
-Status BufferPool::BufferAllocator::Allocate(ClientHandle* client, int64_t len,
- BufferHandle* buffer) {
- SCOPED_TIMER(client->impl_->counters().alloc_time);
- COUNTER_UPDATE(client->impl_->counters().cumulative_bytes_alloced, len);
- COUNTER_UPDATE(client->impl_->counters().cumulative_allocations, 1);
-
- RETURN_IF_ERROR(AllocateInternal(len, buffer));
- DCHECK(buffer->is_open());
- buffer->client_ = client;
- return Status::OK();
-}
-
-Status BufferPool::BufferAllocator::AllocateInternal(int64_t len, BufferHandle* buffer) {
- DCHECK(!buffer->is_open());
- DCHECK_GE(len, min_buffer_len_);
- DCHECK(BitUtil::IsPowerOf2(len)) << len;
-
- std::stringstream err_stream;
- if (UNLIKELY(len > MAX_BUFFER_BYTES)) {
- err_stream << "Tried to allocate buffer of " << len << " bytes"
- << " max of " << MAX_BUFFER_BYTES << " bytes";
- return Status::InternalError(err_stream.str());
- }
- if (UNLIKELY(len > system_bytes_limit_)) {
- err_stream << "Tried to allocate buffer of " << len << " bytes"
- << " > buffer pool limit of " << system_bytes_limit_ << " bytes";
- return Status::InternalError(err_stream.str());
- }
-
- const int current_core = CpuInfo::get_current_core();
- // Fast path: recycle a buffer of the correct size from this core's arena.
- FreeBufferArena* current_core_arena = per_core_arenas_[current_core].get();
- if (current_core_arena->PopFreeBuffer(len, buffer)) return Status::OK();
-
- // Fast-ish path: allocate a new buffer if there is room in 'system_bytes_remaining_'.
- int64_t delta = DecreaseBytesRemaining(len, true, &system_bytes_remaining_);
- if (delta != len) {
- DCHECK_EQ(0, delta);
- const std::vector<int>& numa_node_cores =
- CpuInfo::get_cores_of_same_numa_node(current_core);
- const int numa_node_core_idx = CpuInfo::get_numa_node_core_idx(current_core);
-
- // Fast-ish path: find a buffer of the right size from another core on the same
- // NUMA node. Avoid getting a buffer from another NUMA node - prefer reclaiming
- // a clean page on this NUMA node or scavenging then reallocating a new buffer.
- // We don't want to get into a state where allocations between the nodes are
- // unbalanced and one node is stuck reusing memory allocated on the other node.
- for (int i = 1; i < numa_node_cores.size(); ++i) {
- // Each core should start searching from a different point to avoid hot-spots.
- int other_core = numa_node_cores[(numa_node_core_idx + i) % numa_node_cores.size()];
- FreeBufferArena* other_core_arena = per_core_arenas_[other_core].get();
- if (other_core_arena->PopFreeBuffer(len, buffer)) return Status::OK();
- }
-
- /*
- // Fast-ish path: evict a clean page of the right size from the current NUMA node.
- for (int i = 0; i < numa_node_cores.size(); ++i) {
- int other_core = numa_node_cores[(numa_node_core_idx + i) % numa_node_cores.size()];
- FreeBufferArena* other_core_arena = per_core_arenas_[other_core].get();
- if (other_core_arena->EvictCleanPage(len, buffer)) return Status::OK();
- }
-*/
- // Slow path: scavenge buffers of different sizes from free buffer lists and clean
- // pages. Make initial, fast attempts to gather the required buffers, before
- // finally making a slower, but guaranteed-to-succeed attempt.
- // TODO: IMPALA-4703: add a stress option where we vary the number of attempts
- // randomly.
- int attempt = 0;
- while (attempt < max_scavenge_attempts_ && delta < len) {
- bool final_attempt = attempt == max_scavenge_attempts_ - 1;
- delta += ScavengeBuffers(final_attempt, current_core, len - delta);
- ++attempt;
- }
- if (delta < len) {
- system_bytes_remaining_.fetch_add(delta, std::memory_order_release);
- // This indicates an accounting bug - we should be able to always get the memory.
- std::stringstream err_stream;
- err_stream << "Could not allocate : " << len << "bytes: was only able to free up "
- << delta << " bytes after " << max_scavenge_attempts_ << " attempts:\n"
- << pool_->DebugString();
- return Status::InternalError(err_stream.str());
- }
- }
- // We have headroom to allocate a new buffer at this point.
- DCHECK_EQ(delta, len);
- Status status = system_allocator_->Allocate(len, buffer);
- if (!status.ok()) {
- system_bytes_remaining_.fetch_add(len, std::memory_order_release);
- return status;
- }
- _mem_tracker->consume(len);
- return Status::OK();
-}
-
-int64_t DecreaseBytesRemaining(int64_t max_decrease, bool require_full_decrease,
- std::atomic<int64_t>* bytes_remaining) {
- while (true) {
- int64_t old_value = bytes_remaining->load(std::memory_order_acquire);
- if (require_full_decrease && old_value < max_decrease) return 0;
- int64_t decrease = std::min(old_value, max_decrease);
- int64_t new_value = old_value - decrease;
- if (bytes_remaining->compare_exchange_weak(old_value, new_value,
- std::memory_order_release)) {
- return decrease;
- }
- }
-}
-
-int64_t BufferPool::BufferAllocator::ScavengeBuffers(bool slow_but_sure, int current_core,
- int64_t target_bytes) {
- // There are two strategies for scavenging buffers:
- // 1) Fast, opportunistic: Each arena is searched in succession. Although reservations
- // guarantee that the memory we need is available somewhere, this may fail if we
- // we race with another thread that returned buffers to an arena that we've already
- // searched and took the buffers from an arena we haven't yet searched.
- // 2) Slow, guaranteed to succeed: In order to ensure that we can find the memory in a
- // single pass, we hold locks for all arenas we've already examined. That way, other
- // threads can't take the memory that we need from an arena that we haven't yet
- // examined (or from 'system_bytes_available_') because in order to do so, it would
- // have had to return the equivalent amount of memory to an earlier arena or added
- // it back into 'systems_bytes_remaining_'. The former can't happen since we're
- // still holding those locks, and the latter is solved by trying to decrease
- // system_bytes_remaining_ with DecreaseBytesRemaining() at the end.
- DCHECK_GT(target_bytes, 0);
- // First make sure we've used up all the headroom in the buffer limit.
- int64_t bytes_found = DecreaseBytesRemaining(target_bytes, false, &system_bytes_remaining_);
- if (bytes_found == target_bytes) return bytes_found;
-
- // In 'slow_but_sure' mode, we will hold locks for multiple arenas at the same time and
- // therefore must start at 0 to respect the lock order. Otherwise we start with the
- // current core's arena for locality and to avoid excessive contention on arena 0.
- int start_core = slow_but_sure ? 0 : current_core;
- std::vector<std::unique_lock<SpinLock>> arena_locks;
- if (slow_but_sure) arena_locks.resize(per_core_arenas_.size());
-
- for (int i = 0; i < per_core_arenas_.size(); ++i) {
- int core_to_check = (start_core + i) % per_core_arenas_.size();
- FreeBufferArena* arena = per_core_arenas_[core_to_check].get();
- int64_t bytes_needed = target_bytes - bytes_found;
- bytes_found += arena->FreeSystemMemory(bytes_needed, bytes_needed,
- slow_but_sure ? &arena_locks[i] : nullptr)
- .second;
- if (bytes_found == target_bytes) break;
- }
- DCHECK_LE(bytes_found, target_bytes);
-
- // Decrement 'system_bytes_remaining_' while still holding the arena locks to avoid
- // the window for a race with another thread that removes a buffer from a list and
- // then increments 'system_bytes_remaining_'. The race is prevented because the other
- // thread holds the lock while decrementing 'system_bytes_remaining_' in the cases
- // where it may not have reservation corresponding to that memory.
- if (slow_but_sure && bytes_found < target_bytes) {
- bytes_found +=
- DecreaseBytesRemaining(target_bytes - bytes_found, true, &system_bytes_remaining_);
- // Deadlock in arena_locks in BufferPool::BufferAllocator::ScavengeBuffers and _lock in DebugString
- // DCHECK_EQ(bytes_found, target_bytes) << DebugString();
- }
- return bytes_found;
-}
-
-void BufferPool::BufferAllocator::Free(BufferHandle&& handle) {
- DCHECK(handle.is_open());
- handle.client_ = nullptr; // Buffer is no longer associated with a client.
- FreeBufferArena* arena = per_core_arenas_[handle.home_core_].get();
- handle.Poison();
- if (!arena->AddFreeBuffer(std::move(handle))) {
- _mem_tracker->release(handle.len());
- }
-}
-
-void BufferPool::BufferAllocator::AddCleanPage(const std::unique_lock<std::mutex>& client_lock,
- Page* page) {
- page->client->DCheckHoldsLock(client_lock);
- FreeBufferArena* arena = per_core_arenas_[page->buffer.home_core_].get();
- arena->AddCleanPage(page);
-}
-
-bool BufferPool::BufferAllocator::RemoveCleanPage(const std::unique_lock<std::mutex>& client_lock,
- bool claim_buffer, Page* page) {
- page->client->DCheckHoldsLock(client_lock);
- FreeBufferArena* arena;
- {
- std::lock_guard<SpinLock> pl(page->buffer_lock);
- // Page may be evicted - in which case it has no home core and is not in an arena.
- if (!page->buffer.is_open()) return false;
- arena = per_core_arenas_[page->buffer.home_core_].get();
- }
- return arena->RemoveCleanPage(claim_buffer, page);
-}
-
-void BufferPool::BufferAllocator::Maintenance() {
- for (std::unique_ptr<FreeBufferArena>& arena : per_core_arenas_) arena->Maintenance();
-}
-
-void BufferPool::BufferAllocator::ReleaseMemory(int64_t bytes_to_free) {
- int64_t bytes_freed = 0;
- int current_core = CpuInfo::get_current_core();
- for (int i = 0; i < per_core_arenas_.size(); ++i) {
- int core_to_check = (current_core + i) % per_core_arenas_.size();
- FreeBufferArena* arena = per_core_arenas_[core_to_check].get();
- // Free but don't claim any memory.
- bytes_freed += arena->FreeSystemMemory(bytes_to_free - bytes_freed, 0, nullptr).first;
- if (bytes_freed >= bytes_to_free) return;
- }
-}
-
-int BufferPool::BufferAllocator::GetFreeListSize(int core, int64_t len) {
- return per_core_arenas_[core]->GetFreeListSize(len);
-}
-
-int64_t BufferPool::BufferAllocator::FreeToSystem(std::vector<BufferHandle>&& buffers) {
- int64_t bytes_freed = 0;
- for (BufferHandle& buffer : buffers) {
- bytes_freed += buffer.len();
- // Ensure that the memory is unpoisoned when it's next allocated by the system.
- buffer.Unpoison();
- system_allocator_->Free(std::move(buffer));
- }
- _mem_tracker->release(bytes_freed);
- return bytes_freed;
-}
-
-int64_t BufferPool::BufferAllocator::SumOverArenas(
- std::function<int64_t(FreeBufferArena* arena)> compute_fn) const {
- int64_t total = 0;
- for (const std::unique_ptr<FreeBufferArena>& arena : per_core_arenas_) {
- total += compute_fn(arena.get());
- }
- return total;
-}
-
-int64_t BufferPool::BufferAllocator::GetNumFreeBuffers() const {
- return SumOverArenas([](FreeBufferArena* arena) { return arena->GetNumFreeBuffers(); });
-}
-
-int64_t BufferPool::BufferAllocator::GetFreeBufferBytes() const {
- return SumOverArenas([](FreeBufferArena* arena) { return arena->GetFreeBufferBytes(); });
-}
-
-int64_t BufferPool::BufferAllocator::GetNumCleanPages() const {
- return SumOverArenas([](FreeBufferArena* arena) { return arena->GetNumCleanPages(); });
-}
-
-int64_t BufferPool::BufferAllocator::GetCleanPageBytesLimit() const {
- return clean_page_bytes_limit_;
-}
-
-int64_t BufferPool::BufferAllocator::GetCleanPageBytes() const {
- return clean_page_bytes_limit_ - clean_page_bytes_remaining_.load(std::memory_order_acquire);
-}
-
-std::string BufferPool::BufferAllocator::DebugString() {
- std::stringstream ss;
- ss << "<BufferAllocator> " << this << " min_buffer_len: " << min_buffer_len_
- << " system_bytes_limit: " << system_bytes_limit_
- << " system_bytes_remaining: " << system_bytes_remaining_.load(std::memory_order_acquire)
- << "\n"
- << " clean_page_bytes_limit: " << clean_page_bytes_limit_ << " clean_page_bytes_remaining: "
- << clean_page_bytes_remaining_.load(std::memory_order_acquire) << "\n";
- for (int i = 0; i < per_core_arenas_.size(); ++i) {
- ss << " Arena " << i << " " << per_core_arenas_[i]->DebugString() << "\n";
- }
- return ss.str();
-}
-
-BufferPool::FreeBufferArena::FreeBufferArena(BufferAllocator* parent) : parent_(parent) {}
-
-BufferPool::FreeBufferArena::~FreeBufferArena() {
- for (int i = 0; i < NumBufferSizes(); ++i) {
- // Clear out the free lists.
- FreeList* list = &buffer_sizes_[i].free_buffers;
- std::vector<BufferHandle> buffers = list->GetBuffersToFree(list->Size());
- parent_->system_bytes_remaining_.fetch_add(parent_->FreeToSystem(std::move(buffers)),
- std::memory_order_release);
-
- // All pages should have been destroyed.
- DCHECK_EQ(0, buffer_sizes_[i].clean_pages.size());
- }
-}
-
-bool BufferPool::FreeBufferArena::AddFreeBuffer(BufferHandle&& buffer) {
- std::lock_guard<SpinLock> al(lock_);
- if (config::disable_mem_pools) {
- int64_t len = buffer.len();
- parent_->system_allocator_->Free(std::move(buffer));
- parent_->system_bytes_remaining_.fetch_add(len, std::memory_order_release);
- return false;
- }
- PerSizeLists* lists = GetListsForSize(buffer.len());
- lists->AddFreeBuffer(std::move(buffer));
- return true;
-}
-
-bool BufferPool::FreeBufferArena::RemoveCleanPage(bool claim_buffer, Page* page) {
- std::lock_guard<SpinLock> al(lock_);
- PerSizeLists* lists = GetListsForSize(page->len);
- DCHECK_EQ(lists->num_clean_pages.load(std::memory_order_acquire), lists->clean_pages.size());
- if (!lists->clean_pages.remove(page)) return false;
- lists->num_clean_pages.fetch_sub(1, std::memory_order_release);
- parent_->clean_page_bytes_remaining_.fetch_add(page->len, std::memory_order_release);
- if (!claim_buffer) {
- BufferHandle buffer;
- {
- std::lock_guard<SpinLock> pl(page->buffer_lock);
- buffer = std::move(page->buffer);
- }
- lists->AddFreeBuffer(std::move(buffer));
- }
- return true;
-}
-
-bool BufferPool::FreeBufferArena::PopFreeBuffer(int64_t buffer_len, BufferHandle* buffer) {
- PerSizeLists* lists = GetListsForSize(buffer_len);
- // Check before acquiring lock.
- if (lists->num_free_buffers.load(std::memory_order_acquire) == 0) return false;
-
- std::lock_guard<SpinLock> al(lock_);
- FreeList* list = &lists->free_buffers;
- DCHECK_EQ(lists->num_free_buffers.load(std::memory_order_acquire), list->Size());
- if (!list->PopFreeBuffer(buffer)) return false;
- buffer->Unpoison();
- lists->num_free_buffers.fetch_sub(1, std::memory_order_release);
- lists->low_water_mark = std::min<int>(lists->low_water_mark, list->Size());
- return true;
-}
-/*
-bool BufferPool::FreeBufferArena::EvictCleanPage(
- int64_t buffer_len, BufferHandle* buffer) {
- PerSizeLists* lists = GetListsForSize(buffer_len);
- // Check before acquiring lock.
- if (lists->num_clean_pages.Load() == 0) return false;
-
- std::lock_guard<SpinLock> al(lock_);
- DCHECK_EQ(lists->num_clean_pages.Load(), lists->clean_pages.size());
- Page* page = lists->clean_pages.dequeue();
- if (page == nullptr) return false;
- lists->num_clean_pages.Add(-1);
- parent_->clean_page_bytes_remaining_.Add(buffer_len);
- std::lock_guard<SpinLock> pl(page->buffer_lock);
- *buffer = std::move(page->buffer);
- return true;
-}
-*/
-std::pair<int64_t, int64_t> BufferPool::FreeBufferArena::FreeSystemMemory(
- int64_t target_bytes_to_free, int64_t target_bytes_to_claim,
- std::unique_lock<SpinLock>* arena_lock) {
- DCHECK_GT(target_bytes_to_free, 0);
- DCHECK_GE(target_bytes_to_free, target_bytes_to_claim);
- int64_t bytes_freed = 0;
- // If the caller is acquiring the lock, just lock for the whole method.
- // Otherwise lazily acquire the lock the first time we find some memory
- // to free.
- std::unique_lock<SpinLock> al(lock_, std::defer_lock_t());
- if (arena_lock != nullptr) al.lock();
-
- std::vector<BufferHandle> buffers;
- // Search from largest to smallest to avoid freeing many small buffers unless
- // necessary.
- for (int i = NumBufferSizes() - 1; i >= 0; --i) {
- PerSizeLists* lists = &buffer_sizes_[i];
- // Check before acquiring lock to avoid expensive lock acquisition and make scanning
- // empty lists much cheaper.
- if (lists->num_free_buffers.load(std::memory_order_acquire) == 0 &&
- lists->num_clean_pages.load(std::memory_order_acquire) == 0) {
- continue;
- }
- if (!al.owns_lock()) al.lock();
- FreeList* free_buffers = &lists->free_buffers;
- InternalList<Page>* clean_pages = &lists->clean_pages;
- DCHECK_EQ(lists->num_free_buffers.load(std::memory_order_acquire), free_buffers->Size());
- DCHECK_EQ(lists->num_clean_pages.load(std::memory_order_acquire), clean_pages->size());
-
- // Figure out how many of the buffers in the free list we should free.
- DCHECK_GT(target_bytes_to_free, bytes_freed);
- const int64_t buffer_len = 1L << (i + parent_->log_min_buffer_len_);
- int64_t buffers_to_free =
- std::min(free_buffers->Size(),
- BitUtil::Ceil(target_bytes_to_free - bytes_freed, buffer_len));
- int64_t buffer_bytes_to_free = buffers_to_free * buffer_len;
-
- // Evict clean pages by moving their buffers to the free page list before freeing
- // them. This ensures that they are freed based on memory address in the expected
- // order.
- int num_pages_evicted = 0;
- int64_t page_bytes_evicted = 0;
- while (bytes_freed + buffer_bytes_to_free < target_bytes_to_free) {
- Page* page = clean_pages->dequeue();
- if (page == nullptr) break;
- BufferHandle page_buffer;
- {
- std::lock_guard<SpinLock> pl(page->buffer_lock);
- page_buffer = std::move(page->buffer);
- }
- ++buffers_to_free;
- buffer_bytes_to_free += page_buffer.len();
- ++num_pages_evicted;
- page_bytes_evicted += page_buffer.len();
- free_buffers->AddFreeBuffer(std::move(page_buffer));
- }
- lists->num_free_buffers.fetch_add(num_pages_evicted, std::memory_order_release);
- lists->num_clean_pages.fetch_sub(num_pages_evicted, std::memory_order_release);
- parent_->clean_page_bytes_remaining_.fetch_add(page_bytes_evicted,
- std::memory_order_release);
-
- if (buffers_to_free > 0) {
- int64_t buffer_bytes_freed =
- parent_->FreeToSystem(free_buffers->GetBuffersToFree(buffers_to_free));
- DCHECK_EQ(buffer_bytes_to_free, buffer_bytes_freed);
- bytes_freed += buffer_bytes_to_free;
- lists->num_free_buffers.fetch_sub(buffers_to_free, std::memory_order_release);
- lists->low_water_mark = std::min<int>(lists->low_water_mark, free_buffers->Size());
- if (bytes_freed >= target_bytes_to_free) break;
- }
- // Should have cleared out all lists if we don't have enough memory at this point.
- DCHECK_EQ(0, free_buffers->Size());
- DCHECK_EQ(0, clean_pages->size());
- }
- int64_t bytes_claimed = std::min(bytes_freed, target_bytes_to_claim);
- if (bytes_freed > bytes_claimed) {
- // Add back the extra for other threads before releasing the lock to avoid race
- // where the other thread may not be able to find enough buffers.
- parent_->system_bytes_remaining_.fetch_add((bytes_freed - bytes_claimed),
- std::memory_order_release);
- }
- if (arena_lock != nullptr) *arena_lock = std::move(al);
- return std::make_pair(bytes_freed, bytes_claimed);
-}
-
-void BufferPool::FreeBufferArena::AddCleanPage(Page* page) {
- bool eviction_needed =
- config::disable_mem_pools ||
- DecreaseBytesRemaining(page->len, true, &parent_->clean_page_bytes_remaining_) == 0;
- std::lock_guard<SpinLock> al(lock_);
- PerSizeLists* lists = GetListsForSize(page->len);
- DCHECK_EQ(lists->num_clean_pages.load(std::memory_order_acquire), lists->clean_pages.size());
- if (eviction_needed) {
- if (lists->clean_pages.empty()) {
- // No other pages to evict, must evict 'page' instead of adding it.
- lists->AddFreeBuffer(std::move(page->buffer));
- } else {
- // Evict an older page (FIFO eviction) to make space for this one.
- Page* page_to_evict = lists->clean_pages.dequeue();
- lists->clean_pages.enqueue(page);
- BufferHandle page_to_evict_buffer;
- {
- std::lock_guard<SpinLock> pl(page_to_evict->buffer_lock);
- page_to_evict_buffer = std::move(page_to_evict->buffer);
- }
- lists->AddFreeBuffer(std::move(page_to_evict_buffer));
- }
- } else {
- lists->clean_pages.enqueue(page);
- lists->num_clean_pages.fetch_add(1, std::memory_order_release);
- }
-}
-
-void BufferPool::FreeBufferArena::Maintenance() {
- std::lock_guard<SpinLock> al(lock_);
- for (int i = 0; i < NumBufferSizes(); ++i) {
- PerSizeLists* lists = &buffer_sizes_[i];
- DCHECK_LE(lists->low_water_mark, lists->free_buffers.Size());
- if (lists->low_water_mark != 0) {
- // We haven't needed the buffers below the low water mark since the previous
- // Maintenance() call. Discard half of them to free up memory. By always discarding
- // at least one, we guarantee that an idle list will shrink to zero entries.
- int num_to_free = std::max(1, lists->low_water_mark / 2);
- parent_->system_bytes_remaining_.fetch_add(
- parent_->FreeToSystem(lists->free_buffers.GetBuffersToFree(num_to_free)),
- std::memory_order_release);
- lists->num_free_buffers.fetch_sub(num_to_free, std::memory_order_release);
- }
- lists->low_water_mark = lists->free_buffers.Size();
- }
-}
-
-int BufferPool::FreeBufferArena::GetFreeListSize(int64_t len) {
- std::lock_guard<SpinLock> al(lock_);
- PerSizeLists* lists = GetListsForSize(len);
- DCHECK_EQ(lists->num_free_buffers.load(std::memory_order_acquire), lists->free_buffers.Size());
- return lists->free_buffers.Size();
-}
-
-int64_t BufferPool::FreeBufferArena::SumOverSizes(
- std::function<int64_t(PerSizeLists* lists, int64_t buffer_size)> compute_fn) {
- int64_t total = 0;
- for (int i = 0; i < NumBufferSizes(); ++i) {
- int64_t buffer_size = (1L << i) * parent_->min_buffer_len_;
- total += compute_fn(&buffer_sizes_[i], buffer_size);
- }
- return total;
-}
-
-int64_t BufferPool::FreeBufferArena::GetNumFreeBuffers() {
- return SumOverSizes([](PerSizeLists* lists, int64_t buffer_size) {
- return lists->num_free_buffers.load(std::memory_order_acquire);
- });
-}
-
-int64_t BufferPool::FreeBufferArena::GetFreeBufferBytes() {
- return SumOverSizes([](PerSizeLists* lists, int64_t buffer_size) {
- return lists->num_free_buffers.load(std::memory_order_acquire) * buffer_size;
- });
-}
-
-int64_t BufferPool::FreeBufferArena::GetNumCleanPages() {
- return SumOverSizes([](PerSizeLists* lists, int64_t buffer_size) {
- return lists->num_clean_pages.load(std::memory_order_acquire);
- });
-}
-
-std::string BufferPool::FreeBufferArena::DebugString() {
- std::lock_guard<SpinLock> al(lock_);
- std::stringstream ss;
- ss << "<FreeBufferArena> " << this << "\n";
- for (int i = 0; i < NumBufferSizes(); ++i) {
- int64_t buffer_len = 1L << (parent_->log_min_buffer_len_ + i);
- PerSizeLists& lists = buffer_sizes_[i];
- ss << " " << PrettyPrinter::print_bytes(buffer_len) << ":"
- << " free buffers: " << lists.num_free_buffers.load(std::memory_order_acquire)
- << " low water mark: " << lists.low_water_mark
- << " clean pages: " << lists.num_clean_pages.load(std::memory_order_acquire) << " ";
- lists.clean_pages.iterate(
- std::bind<bool>(Page::DebugStringCallback, &ss, std::placeholders::_1));
- ss << "\n";
- }
- return ss.str();
-}
-} // namespace doris
diff --git a/be/src/runtime/bufferpool/buffer_allocator.h b/be/src/runtime/bufferpool/buffer_allocator.h
deleted file mode 100644
index cf2a0f741e..0000000000
--- a/be/src/runtime/bufferpool/buffer_allocator.h
+++ /dev/null
@@ -1,241 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include "runtime/bufferpool/buffer_pool_internal.h"
-#include "runtime/bufferpool/free_list.h"
-#include "runtime/memory/mem_tracker.h"
-#include "util/aligned_new.h"
-
-namespace doris {
-
-/// The internal buffer allocator used by BufferPool to allocate power-of-two sized
-/// buffers. BufferAllocator builds on top of SystemAllocator by adding caching of
-/// free buffers and clean pages where the memory is not currently in use by a client
-/// but has not yet been released to SystemAllocator.
-///
-/// The allocator is optimised for the common case where an allocation can be served
-/// by reclaiming a buffer of the request size from the current core's arena. In this
-/// case there is no contention for locks between concurrently-running threads. If this
-/// fails, progressively more expensive approaches to allocate memory are tried until
-/// the allocation eventually succeeds (see AllocateInternal() for details).
-///
-/// Buffer Reservations
-/// ===================
-/// The implementation of the BufferAllocator relies on the BufferPool's reservation
-/// tracking system. The allocator is given a hard limit ('system_bytes_limit'), above
-/// which all allocations will fail. Allocations up to 'system_bytes_limit' are
-/// guaranteed to succeed unless an unexpected system error occurs (e.g. we can't allocate
-/// all of the required memory from the OS). Reservations must be set up so that the total
-/// of all reservations does not exceed 'system_bytes_limit', thus ensuring that
-/// BufferAllocator can always find memory to fulfill reservations.
-///
-/// +========================+
-/// | IMPLEMENTATION NOTES |
-/// +========================+
-///
-/// Memory
-/// ======
-/// Memory managed by BufferAllocator comes in four forms:
-/// 1. Buffers returned to the client (corresponding to a used reservation)
-/// 2. Free buffers cached in the BufferAllocator's free lists.
-/// 3. Buffers attached to clean unpinned pages in the BufferAllocator's clean page lists.
-/// 4. Bytes that are not allocated from the system: 'system_bytes_remaining_'.
-/// Together these always add up to 'system_bytes_limit', which allows BufferAllocator
-/// to always fulfill reservations via some combination of memory in forms 2, 3 or 4.
-///
-/// The BufferAllocator code is careful not to make memory inaccessible to concurrently
-/// executing threads that are entitled to it. E.g. if one thread is entitled to allocate
-/// a 1MB buffer from the BufferAllocator's free or clean page lists but needs to release
-/// a 2MB buffer to the system to free up enough memory, it must add 1MB to
-/// 'system_bytes_remaining_' in the same critical section in which it freed the 2MB
-/// buffer. Otherwise a concurrent thread that had a reservation for 1MB of memory might
-/// not be able to find it.
-///
-/// Arenas
-/// ======
-/// The buffer allocator's data structures are broken up into arenas, with an arena per
-/// core. Within each arena, each buffer or page is stored in a list with buffers and
-/// pages of the same size: there is a separate list for every power-of-two size. Each
-/// arena is protected by a separate lock, so in the common case where threads are able
-/// to fulfill allocations from their own arena, there will be no lock contention.
-///
-struct BufferPool::BufferAllocator {
- BufferAllocator(BufferPool* pool, int64_t min_buffer_len, int64_t system_bytes_limit,
- int64_t clean_page_bytes_limit);
- ~BufferAllocator();
-
- /// Allocate a buffer with a power-of-two length 'len'. This function may acquire
- /// 'FreeBufferArena::lock_' and Page::lock so no locks lower in the lock acquisition
- /// order (see buffer_pool_internal.h) should be held by the caller.
- ///
- /// Always succeeds on allocating memory up to 'system_bytes_limit', unless the system
- /// is unable to give us 'system_bytes_limit' of memory or there is an internal bug: if all
- /// clients write out enough dirty pages to stay within their reservation, then there
- /// should always be enough free buffers and clean pages to reclaim.
- Status Allocate(ClientHandle* client, int64_t len,
- BufferPool::BufferHandle* buffer) WARN_UNUSED_RESULT;
-
- /// Frees 'buffer', which must be open before calling. Closes 'buffer' and updates
- /// internal state but does not release to any reservation.
- void Free(BufferPool::BufferHandle&& buffer);
-
- /// Adds a clean page 'page' to a clean page list. Caller must hold the page's
- /// client's lock via 'client_lock' so that moving the page between the client list and
- /// the free page list is atomic. Caller must not hold 'FreeBufferArena::lock_' or any
- /// Page::lock.
- void AddCleanPage(const std::unique_lock<std::mutex>& client_lock, Page* page);
-
- /// Removes a clean page 'page' from a clean page list, if it is present in one of
- /// the lists. Returns true if it was present, false otherwise. If 'claim_buffer' is true, the
- /// caller must have reservation for the buffer, which is returned along with the page.
- /// Otherwise the buffer is moved directly to the free buffer list. Caller must hold
- /// the page's client's lock via 'client_lock' so that moving the page between the
- /// client list and the free page list is atomic. Caller must not hold
- /// 'FreeBufferArena::lock_' or any Page::lock.
- bool RemoveCleanPage(const std::unique_lock<std::mutex>& client_lock, bool claim_buffer,
- Page* page);
-
- /// Periodically called to release free buffers back to the SystemAllocator. Releases
- /// buffers based on recent allocation patterns, trying to minimise the number of
- /// excess buffers retained in each list above the minimum required to avoid going
- /// to the system allocator.
- void Maintenance();
-
- /// Try to release at least 'bytes_to_free' bytes of memory to the system allocator.
- void ReleaseMemory(int64_t bytes_to_free);
-
- int64_t system_bytes_limit() const { return system_bytes_limit_; }
-
- /// Return the amount of memory currently allocated from the system.
- int64_t GetSystemBytesAllocated() const {
- return system_bytes_limit_ - system_bytes_remaining_.load();
- }
-
- /// Return the total number of free buffers in the allocator.
- int64_t GetNumFreeBuffers() const;
-
- /// Return the total bytes of free buffers in the allocator.
- int64_t GetFreeBufferBytes() const;
-
- /// Return the limit on bytes of clean pages in the allocator.
- int64_t GetCleanPageBytesLimit() const;
-
- /// Return the total number of clean pages in the allocator.
- int64_t GetNumCleanPages() const;
-
- /// Return the total bytes of clean pages in the allocator.
- int64_t GetCleanPageBytes() const;
-
- std::string DebugString();
-
-protected:
- friend class BufferAllocatorTest;
- friend class BufferPoolTest;
- friend class FreeBufferArena;
-
- /// Test helper: gets the current size of the free list for buffers of 'len' bytes
- /// on core 'core'.
- int GetFreeListSize(int core, int64_t len);
-
- /// Test helper: reduce the number of scavenge attempts so backend tests can force
- /// use of the "locked" scavenging code path.
- void set_max_scavenge_attempts(int val) {
- DCHECK_GE(val, 1);
- max_scavenge_attempts_ = val;
- }
-
-private:
- /// Compute the maximum power-of-two buffer length that could be allocated based on the
- /// amount of memory available 'system_bytes_limit'. The value is always at least
- /// 'min_buffer_len' so that there is at least one valid buffer size.
- static int64_t CalcMaxBufferLen(int64_t min_buffer_len, int64_t system_bytes_limit);
-
- /// Same as Allocate() but leaves 'buffer->client_' nullptr and does not update counters.
- Status AllocateInternal(int64_t len, BufferPool::BufferHandle* buffer) WARN_UNUSED_RESULT;
-
- /// Tries to reclaim enough memory from various sources so that the caller can allocate
- /// a buffer of 'target_bytes' from the system allocator. Scavenges buffers from the
- /// free buffer and clean page lists of all cores and frees them with
- /// 'system_allocator_'. Also tries to decrement 'system_bytes_remaining_'.
- /// 'current_core' is the index of the current CPU core. Any bytes freed in excess of
- /// 'target_bytes' are added to 'system_bytes_remaining_.' If 'slow_but_sure' is true,
- /// this function uses a slower strategy that guarantees enough memory will be found
- /// but can block progress of other threads for longer. If 'slow_but_sure' is false,
- /// then this function optimistically tries to reclaim the memory but may not reclaim
- /// 'target_bytes' of memory. Returns the number of bytes reclaimed.
- int64_t ScavengeBuffers(bool slow_but_sure, int current_core, int64_t target_bytes);
-
- /// Helper to free a list of buffers to the system. Returns the number of bytes freed.
- int64_t FreeToSystem(std::vector<BufferHandle>&& buffers);
-
- /// Compute a sum over all arenas. Does not lock the arenas.
- int64_t SumOverArenas(std::function<int64_t(FreeBufferArena* arena)> compute_fn) const;
-
- /// The pool that this allocator is associated with.
- BufferPool* const pool_;
-
- /// System allocator that is ultimately used to allocate and free buffers.
- const std::unique_ptr<SystemAllocator> system_allocator_;
-
- /// The minimum power-of-two buffer length that can be allocated.
- const int64_t min_buffer_len_;
-
- /// The maximum power-of-two buffer length that can be allocated. Always >=
- /// 'min_buffer_len' so that there is at least one valid buffer size.
- const int64_t max_buffer_len_;
-
- /// The log2 of 'min_buffer_len_'.
- const int log_min_buffer_len_;
-
- /// The log2 of 'max_buffer_len_'.
- const int log_max_buffer_len_;
-
- /// The maximum physical memory in bytes that will be allocated from the system.
- const int64_t system_bytes_limit_;
-
- /// The remaining number of bytes of 'system_bytes_limit_' that can be used for
- /// allocating new buffers. Must be updated atomically before a new buffer is
- /// allocated or after an existing buffer is freed with the system allocator.
- std::atomic<int64_t> system_bytes_remaining_;
-
- /// The maximum bytes of clean pages that can accumulate across all arenas before
- /// they will be evicted.
- const int64_t clean_page_bytes_limit_;
-
- /// The number of bytes of 'clean_page_bytes_limit_' not used by clean pages. I.e.
- /// (clean_page_bytes_limit - bytes of clean pages in the BufferAllocator).
- /// 'clean_page_bytes_limit_' is enforced by decreasing this value before a
- /// clean page is added and increasing it after a clean page is reclaimed or evicted.
- std::atomic<int64_t> clean_page_bytes_remaining_;
-
- /// Free and clean pages. One arena per core.
- std::vector<std::unique_ptr<FreeBufferArena>> per_core_arenas_;
-
- /// Default number of times to attempt scavenging.
- static const int MAX_SCAVENGE_ATTEMPTS = 3;
-
- /// Number of times to attempt scavenging. Usually MAX_SCAVENGE_ATTEMPTS but can be
- /// overridden by tests. The first max_scavenge_attempts_ - 1 attempts do not lock
- /// all arenas so may fail. The final attempt locks all arenas, which is expensive
- /// but is guaranteed to succeed.
- int max_scavenge_attempts_;
-
- std::unique_ptr<MemTracker> _mem_tracker;
-};
-} // namespace doris
diff --git a/be/src/runtime/bufferpool/buffer_pool.cc b/be/src/runtime/bufferpool/buffer_pool.cc
deleted file mode 100644
index 9d11c0f58d..0000000000
--- a/be/src/runtime/bufferpool/buffer_pool.cc
+++ /dev/null
@@ -1,667 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <limits>
-#include <sstream>
-
-#include "gutil/strings/substitute.h"
-#include "runtime/bufferpool/buffer_allocator.h"
-#include "runtime/bufferpool/buffer_pool_internal.h"
-#include "util/bit_util.h"
-#include "util/cpu_info.h"
-#include "util/runtime_profile.h"
-#include "util/time.h"
-#include "util/uid_util.h"
-
-//DEFINE_int32(concurrent_scratch_ios_per_device, 2,
-// "Set this to influence the number of concurrent write I/Os issues to write data to "
-// "scratch files. This is multiplied by the number of active scratch directories to "
-// "obtain the target number of scratch write I/Os per query.");
-
-namespace doris {
-
-constexpr int BufferPool::LOG_MAX_BUFFER_BYTES;
-constexpr int64_t BufferPool::MAX_BUFFER_BYTES;
-
-void BufferPool::BufferHandle::Open(uint8_t* data, int64_t len, int home_core) {
- DCHECK_LE(0, home_core);
- DCHECK_LT(home_core, CpuInfo::get_max_num_cores());
- client_ = nullptr;
- data_ = data;
- len_ = len;
- home_core_ = home_core;
-}
-
-BufferPool::PageHandle::PageHandle() {
- Reset();
-}
-
-BufferPool::PageHandle::PageHandle(PageHandle&& src) {
- Reset();
- *this = std::move(src);
-}
-
-BufferPool::PageHandle& BufferPool::PageHandle::operator=(PageHandle&& src) {
- DCHECK(!is_open());
- // Copy over all members then close src.
- page_ = src.page_;
- client_ = src.client_;
- src.Reset();
- return *this;
-}
-
-void BufferPool::PageHandle::Open(Page* page, ClientHandle* client) {
- DCHECK(!is_open());
- page_ = page;
- client_ = client;
-}
-
-void BufferPool::PageHandle::Reset() {
- page_ = nullptr;
- client_ = nullptr;
-}
-
-int BufferPool::PageHandle::pin_count() const {
- DCHECK(is_open());
- // The pin count can only be modified via this PageHandle, which must not be
- // concurrently accessed by multiple threads, so it is safe to access without locking
- return page_->pin_count;
-}
-
-int64_t BufferPool::PageHandle::len() const {
- DCHECK(is_open());
- return page_->len; // Does not require locking.
-}
-
-Status BufferPool::PageHandle::GetBuffer(const BufferHandle** buffer) const {
- DCHECK(is_open());
- DCHECK(client_->is_registered());
- DCHECK(is_pinned());
- /*
- if (page_->pin_in_flight) {
- // Finish the work started in Pin().
- RETURN_IF_ERROR(client_->impl_->FinishMoveEvictedToPinned(page_));
- }
-*/
- DCHECK(!page_->pin_in_flight);
- *buffer = &page_->buffer;
- DCHECK((*buffer)->is_open());
- return Status::OK();
-}
-
-BufferPool::BufferPool(int64_t min_buffer_len, int64_t buffer_bytes_limit,
- int64_t clean_page_bytes_limit)
- : allocator_(new BufferAllocator(this, min_buffer_len, buffer_bytes_limit,
- clean_page_bytes_limit)),
- min_buffer_len_(min_buffer_len) {
- CHECK_GT(min_buffer_len, 0);
- CHECK_EQ(min_buffer_len, BitUtil::RoundUpToPowerOfTwo(min_buffer_len));
-}
-
-BufferPool::~BufferPool() {}
-
-Status BufferPool::RegisterClient(const string& name, RuntimeProfile* profile,
- ClientHandle* client) {
- DCHECK(!client->is_registered());
- client->impl_ = new Client(this, //file_group,
- name, profile);
- return Status::OK();
-}
-
-void BufferPool::DeregisterClient(ClientHandle* client) {
- if (!client->is_registered()) return;
- client->impl_->Close(); // Will DCHECK if any remaining buffers or pinned pages.
- delete client->impl_; // Will DCHECK if there are any remaining pages.
- client->impl_ = nullptr;
-}
-
-Status BufferPool::CreatePage(ClientHandle* client, int64_t len, PageHandle* handle,
- const BufferHandle** buffer) {
- DCHECK(!handle->is_open());
- DCHECK_GE(len, min_buffer_len_);
- DCHECK_EQ(len, BitUtil::RoundUpToPowerOfTwo(len));
-
- BufferHandle new_buffer;
- // No changes have been made to state yet, so we can cleanly return on error.
- RETURN_IF_ERROR(AllocateBuffer(client, len, &new_buffer));
- Page* page = client->impl_->CreatePinnedPage(std::move(new_buffer));
- handle->Open(page, client);
- if (buffer != nullptr) *buffer = &page->buffer;
- return Status::OK();
-}
-
-void BufferPool::DestroyPage(ClientHandle* client, PageHandle* handle) {
- if (!handle->is_open()) return; // DestroyPage() should be idempotent.
-
- if (handle->is_pinned()) {
- // Cancel the read I/O - we don't need the data any more.
- //if (handle->page_->pin_in_flight) {
- // handle->page_->write_handle->CancelRead();
- // handle->page_->pin_in_flight = false;
- //}
- // In the pinned case, delegate to ExtractBuffer() and FreeBuffer() to do the work
- // of cleaning up the page, freeing the buffer and updating reservations correctly.
- BufferHandle buffer;
- Status status = ExtractBuffer(client, handle, &buffer);
- DCHECK(status.ok()) << status;
- FreeBuffer(client, &buffer);
- } else {
- // In the unpinned case, no reservations are used so we just clean up the page.
- client->impl_->DestroyPageInternal(handle);
- }
-}
-
-Status BufferPool::Pin(ClientHandle* client, PageHandle* handle) {
- DCHECK(client->is_registered());
- DCHECK(handle->is_open());
- DCHECK_EQ(handle->client_, client);
-
- Page* page = handle->page_;
- if (page->pin_count == 0) {
- RETURN_IF_ERROR(client->impl_->StartMoveToPinned(client, page));
- COUNTER_UPDATE(client->impl_->counters().peak_unpinned_bytes, -page->len);
- }
- // Update accounting last to avoid complicating the error return path above.
- ++page->pin_count;
- return Status::OK();
-}
-
-void BufferPool::Unpin(ClientHandle* client, PageHandle* handle) {
- DCHECK(handle->is_open());
- DCHECK(client->is_registered());
- DCHECK_EQ(handle->client_, client);
- // If handle is pinned, we can assume that the page itself is pinned.
- DCHECK(handle->is_pinned());
- Page* page = handle->page_;
-
- if (--page->pin_count > 0) return;
- //if (page->pin_in_flight) {
- // Data is not in memory - move it back to evicted.
- // client->impl_->UndoMoveEvictedToPinned(page);
- //} else {
- // Data is in memory - move it to dirty unpinned.
- client->impl_->MoveToDirtyUnpinned(page);
- //}
- COUNTER_UPDATE(client->impl_->counters().peak_unpinned_bytes, handle->len());
-}
-
-Status BufferPool::ExtractBuffer(ClientHandle* client, PageHandle* page_handle,
- BufferHandle* buffer_handle) {
- DCHECK(page_handle->is_pinned());
- DCHECK(!buffer_handle->is_open());
- DCHECK_EQ(page_handle->client_, client);
-
- // If an async pin is in flight, we need to wait for it.
- const BufferHandle* dummy;
- RETURN_IF_ERROR(page_handle->GetBuffer(&dummy));
-
- // Bring the pin count to 1 so that we're not using surplus reservations.
- while (page_handle->pin_count() > 1) Unpin(client, page_handle);
-
- // Destroy the page and extract the buffer.
- client->impl_->DestroyPageInternal(page_handle, buffer_handle);
- DCHECK(buffer_handle->is_open());
- return Status::OK();
-}
-
-Status BufferPool::AllocateBuffer(ClientHandle* client, int64_t len, BufferHandle* handle) {
- RETURN_IF_ERROR(client->impl_->PrepareToAllocateBuffer(len));
- Status status = allocator_->Allocate(client, len, handle);
- if (!status.ok()) {
- // Allocation failed - update client's accounting to reflect the failure.
- client->impl_->FreedBuffer(len);
- }
- return status;
-}
-
-void BufferPool::FreeBuffer(ClientHandle* client, BufferHandle* handle) {
- if (!handle->is_open()) return; // Should be idempotent.
- DCHECK_EQ(client, handle->client_);
- int64_t len = handle->len_;
- allocator_->Free(std::move(*handle));
- client->impl_->FreedBuffer(len);
-}
-
-Status BufferPool::TransferBuffer(ClientHandle* src_client, BufferHandle* src,
- ClientHandle* dst_client, BufferHandle* dst) {
- DCHECK(src->is_open());
- DCHECK(!dst->is_open());
- DCHECK_EQ(src_client, src->client_);
- DCHECK_NE(src, dst);
- DCHECK_NE(src_client, dst_client);
-
- *dst = std::move(*src);
- dst->client_ = dst_client;
- return Status::OK();
-}
-
-void BufferPool::Maintenance() {
- allocator_->Maintenance();
-}
-
-void BufferPool::ReleaseMemory(int64_t bytes_to_free) {
- allocator_->ReleaseMemory(bytes_to_free);
-}
-
-int64_t BufferPool::GetSystemBytesLimit() const {
- return allocator_->system_bytes_limit();
-}
-
-int64_t BufferPool::GetSystemBytesAllocated() const {
- return allocator_->GetSystemBytesAllocated();
-}
-
-int64_t BufferPool::GetCleanPageBytesLimit() const {
- return allocator_->GetCleanPageBytesLimit();
-}
-
-int64_t BufferPool::GetNumCleanPages() const {
- return allocator_->GetNumCleanPages();
-}
-
-int64_t BufferPool::GetCleanPageBytes() const {
- return allocator_->GetCleanPageBytes();
-}
-
-int64_t BufferPool::GetNumFreeBuffers() const {
- return allocator_->GetNumFreeBuffers();
-}
-
-int64_t BufferPool::GetFreeBufferBytes() const {
- return allocator_->GetFreeBufferBytes();
-}
-
-bool BufferPool::ClientHandle::has_unpinned_pages() const {
- return impl_->has_unpinned_pages();
-}
-
-BufferPool::Client::Client(BufferPool* pool, //TmpFileMgr::FileGroup* file_group,
- const string& name, RuntimeProfile* profile)
- : pool_(pool),
- //file_group_(file_group),
- name_(name),
- debug_write_delay_ms_(0),
- num_pages_(0),
- buffers_allocated_bytes_(0) {
- // Set up a child profile with buffer pool info.
- RuntimeProfile* child_profile = profile->create_child("Buffer pool", true, true);
- counters_.alloc_time = ADD_TIMER(child_profile, "AllocTime");
- counters_.cumulative_allocations =
- ADD_COUNTER(child_profile, "CumulativeAllocations", TUnit::UNIT);
- counters_.cumulative_bytes_alloced =
- ADD_COUNTER(child_profile, "CumulativeAllocationBytes", TUnit::BYTES);
- counters_.peak_unpinned_bytes =
- child_profile->AddHighWaterMarkCounter("PeakUnpinnedBytes", TUnit::BYTES);
-}
-
-BufferPool::Page* BufferPool::Client::CreatePinnedPage(BufferHandle&& buffer) {
- Page* page = new Page(this, buffer.len());
- page->buffer = std::move(buffer);
- page->pin_count = 1;
-
- std::lock_guard<std::mutex> lock(lock_);
- // The buffer is transferred to the page so will be accounted for in
- // pinned_pages_.bytes() instead of buffers_allocated_bytes_.
- buffers_allocated_bytes_ -= page->len;
- pinned_pages_.enqueue(page);
- ++num_pages_;
- DCHECK_CONSISTENCY();
- return page;
-}
-
-void BufferPool::Client::DestroyPageInternal(PageHandle* handle, BufferHandle* out_buffer) {
- DCHECK(handle->is_pinned() || out_buffer == nullptr);
- Page* page = handle->page_;
- // Remove the page from the list that it is currently present in (if any).
- {
- std::unique_lock<std::mutex> cl(lock_);
- // First try to remove from the pinned or dirty unpinned lists.
- if (!pinned_pages_.remove(page) && !dirty_unpinned_pages_.remove(page)) {
- // The page either has a write in flight, is clean, or is evicted.
- // Let the write complete, if in flight.
- //WaitForWrite(&cl, page);
- // If clean, remove it from the clean pages list. If evicted, this is a no-op.
- pool_->allocator_->RemoveCleanPage(cl, out_buffer != nullptr, page);
- }
- DCHECK(!page->in_queue());
- --num_pages_;
- }
-
- //if (page->write_handle != nullptr) {
- // Discard any on-disk data.
- //file_group_->DestroyWriteHandle(move(page->write_handle));
- //}
- //
- if (out_buffer != nullptr) {
- DCHECK(page->buffer.is_open());
- *out_buffer = std::move(page->buffer);
- buffers_allocated_bytes_ += out_buffer->len();
- } else if (page->buffer.is_open()) {
- pool_->allocator_->Free(std::move(page->buffer));
- }
- delete page;
- handle->Reset();
-}
-
-void BufferPool::Client::MoveToDirtyUnpinned(Page* page) {
- // Only valid to unpin pages if spilling is enabled.
- // DCHECK(spilling_enabled());
- DCHECK_EQ(0, page->pin_count);
-
- std::unique_lock<std::mutex> lock(lock_);
- DCHECK_CONSISTENCY();
- DCHECK(pinned_pages_.contains(page));
- pinned_pages_.remove(page);
- dirty_unpinned_pages_.enqueue(page);
-
- // Check if we should initiate writes for this (or another) dirty page.
- //WriteDirtyPagesAsync();
-}
-
-Status BufferPool::Client::StartMoveToPinned(ClientHandle* client, Page* page) {
- std::unique_lock<std::mutex> cl(lock_);
- DCHECK_CONSISTENCY();
- // Propagate any write errors that occurred for this client.
- //RETURN_IF_ERROR(write_status_i;
-
- if (dirty_unpinned_pages_.remove(page)) {
- // No writes were initiated for the page - just move it back to the pinned state.
- pinned_pages_.enqueue(page);
- return Status::OK();
- }
-
- return Status::InternalError("start move to pinned error, page is not in dirty.");
- /*
- if (in_flight_write_pages_.contains(page)) {
- // A write is in flight. If so, wait for it to complete - then we only have to
- // handle the pinned and evicted cases.
- WaitForWrite(&cl, page);
- RETURN_IF_ERROR(write_status_); // The write may have set 'write_status_'.
- }
-
- // At this point we need to either reclaim a clean page or allocate a new buffer.
- // We may need to clean some pages to do so.
- RETURN_IF_ERROR(CleanPages(&cl, page->len));
- if (pool_->allocator_->RemoveCleanPage(cl, true, page)) {
- // The clean page still has an associated buffer. Restore the data, and move the page
- // back to the pinned state.
- pinned_pages_.enqueue(page);
- DCHECK(page->buffer.is_open());
- DCHECK(page->write_handle != nullptr);
- // Don't need on-disk data.
- cl.unlock(); // Don't block progress for other threads operating on other pages.
- return file_group_->RestoreData(move(page->write_handle), page->buffer.mem_range());
- }
- // If the page wasn't in the clean pages list, it must have been evicted.
- return StartMoveEvictedToPinned(&cl, client, page);
-*/
-}
-/*
-Status BufferPool::Client::StartMoveEvictedToPinned(
- unique_lock<std::mutex>* client_lock, ClientHandle* client, Page* page) {
- DCHECK(!page->buffer.is_open());
-
- // Safe to modify the page's buffer handle without holding the page lock because no
- // concurrent operations can modify evicted pages.
- BufferHandle buffer;
- RETURN_IF_ERROR(pool_->allocator_->Allocate(client, page->len, &page->buffer));
- COUNTER_ADD(counters().bytes_read, page->len);
- COUNTER_ADD(counters().read_io_ops, 1);
- RETURN_IF_ERROR(
- file_group_->ReadAsync(page->write_handle.get(), page->buffer.mem_range()));
- pinned_pages_.enqueue(page);
- page->pin_in_flight = true;
- DCHECK_CONSISTENCY();
- return Status::OK();
-}
-
-void BufferPool::Client::UndoMoveEvictedToPinned(Page* page) {
- // We need to get the page back to the evicted state where:
- // * There is no in-flight read.
- // * The page's data is on disk referenced by 'write_handle'
- // * The page has no attached buffer.
- DCHECK(page->pin_in_flight);
- page->write_handle->CancelRead();
- page->pin_in_flight = false;
-
- unique_lock<std::mutex> lock(lock_);
- DCHECK_CONSISTENCY();
- DCHECK(pinned_pages_.contains(page));
- pinned_pages_.remove(page);
- // Discard the buffer - the pin was in flight so there was no way that a valid
- // reference to the buffer's contents was returned since the pin was still in flight.
- pool_->allocator_->Free(move(page->buffer));
-}
-*/
-/*
-Status BufferPool::Client::FinishMoveEvictedToPinned(Page* page) {
- DCHECK(page->pin_in_flight);
- SCOPED_TIMER(counters().read_wait_time);
- // Don't hold any locks while reading back the data. It is safe to modify the page's
- // buffer handle without holding any locks because no concurrent operations can modify
- // evicted pages.
- RETURN_IF_ERROR(
- file_group_->WaitForAsyncRead(page->write_handle.get(), page->buffer.mem_range()));
- file_group_->DestroyWriteHandle(move(page->write_handle));
- page->pin_in_flight = false;
- return Status::OK();
-}
-*/
-Status BufferPool::Client::PrepareToAllocateBuffer(int64_t len) {
- std::unique_lock<std::mutex> lock(lock_);
- // Clean enough pages to allow allocation to proceed without violating our eviction
- // policy. This can fail, so only update the accounting once success is ensured.
- //RETURN_IF_ERROR(CleanPages(&lock, len));
- buffers_allocated_bytes_ += len;
- DCHECK_CONSISTENCY();
- return Status::OK();
-}
-
-Status BufferPool::Client::CleanPages(std::unique_lock<std::mutex>* client_lock, int64_t len) {
- DCheckHoldsLock(*client_lock);
- DCHECK_CONSISTENCY();
- /*
- // Work out what we need to get bytes of dirty unpinned + in flight pages down to
- // in order to satisfy the eviction policy.
- int64_t target_dirty_bytes = reservation_.GetReservation() - buffers_allocated_bytes_
- - pinned_pages_.bytes() - len;
- // Start enough writes to ensure that the loop condition below will eventually become
- // false (or a write error will be encountered).
- int64_t min_bytes_to_write =
- max<int64_t>(0, dirty_unpinned_pages_.bytes() - target_dirty_bytes);
- //WriteDirtyPagesAsync(min_bytes_to_write);
-
- // One of the writes we initiated, or an earlier in-flight write may have hit an error.
- RETURN_IF_ERROR(write_status_);
-
- // Wait until enough writes have finished so that we can make the allocation without
- // violating the eviction policy. I.e. so that other clients can immediately get the
- // memory they're entitled to without waiting for this client's write to complete.
- DCHECK_GE(in_flight_write_pages_.bytes(), min_bytes_to_write);
- while (dirty_unpinned_pages_.bytes() + in_flight_write_pages_.bytes()
- > target_dirty_bytes) {
- SCOPED_TIMER(counters().write_wait_time);
- write_complete_cv_.Wait(*client_lock);
- RETURN_IF_ERROR(write_status_); // Check if error occurred while waiting.
- }
-*/
- return Status::OK();
-}
-/*
-void BufferPool::Client::WriteDirtyPagesAsync(int64_t min_bytes_to_write) {
- DCHECK_GE(min_bytes_to_write, 0);
- DCHECK_LE(min_bytes_to_write, dirty_unpinned_pages_.bytes());
- // if (file_group_ == nullptr) {
- // Spilling disabled - there should be no unpinned pages to write.
- DCHECK_EQ(0, min_bytes_to_write);
- DCHECK_EQ(0, dirty_unpinned_pages_.bytes());
- return;
-//// }
-
- // No point in starting writes if an error occurred because future operations for the
- // client will fail regardless.
- if (!write_status_.ok()) return;
-
- // Compute the ideal amount of writes to start. We use a simple heuristic based on the
- // total number of writes. The FileGroup's allocation should spread the writes across
- // disks somewhat, but doesn't guarantee we're fully using all available disks. In
- // future we could track the # of writes per-disk.
- const int64_t target_writes = FLAGS_concurrent_scratch_ios_per_device
- * file_group_->tmp_file_mgr()->NumActiveTmpDevices();
-
- int64_t bytes_written = 0;
- while (!dirty_unpinned_pages_.empty()
- && (bytes_written < min_bytes_to_write
- || in_flight_write_pages_.size() < target_writes)) {
- Page* page = dirty_unpinned_pages_.tail(); // LIFO.
- DCHECK(page != nullptr) << "Should have been enough dirty unpinned pages";
- {
- std::lock_guard<SpinLock> pl(page->buffer_lock);
- DCHECK(file_group_ != nullptr);
- DCHECK(page->buffer.is_open());
- COUNTER_ADD(counters().bytes_written, page->len);
- COUNTER_ADD(counters().write_io_ops, 1);
- Status status = file_group_->Write(page->buffer.mem_range(),
- [this, page](const Status& write_status) {
- WriteCompleteCallback(page, write_status);
- },
- &page->write_handle);
- // Exit early on error: there is no point in starting more writes because future
- /// operations for this client will fail regardless.
- if (!status.ok()) {
- write_status_.MergeStatus(status);
- return;
- }
- }
- // Now that the write is in flight, update all the state
- Page* tmp = dirty_unpinned_pages_.pop_back();
- DCHECK_EQ(tmp, page);
- in_flight_write_pages_.enqueue(page);
- bytes_written += page->len;
- }
-}
-
-void BufferPool::Client::WriteCompleteCallback(Page* page, const Status& write_status) {
-#ifndef NDEBUG
- if (debug_write_delay_ms_ > 0) SleepForMs(debug_write_delay_ms_);
-#endif
- {
- std::unique_lock<std::mutex> cl(lock_);
- DCHECK(in_flight_write_pages_.contains(page));
- // The status should always be propagated.
- // TODO: if we add cancellation support to TmpFileMgr, consider cancellation path.
- if (!write_status.ok()) write_status_.MergeStatus(write_status);
- in_flight_write_pages_.remove(page);
- // Move to clean pages list even if an error was encountered - the buffer can be
- // repurposed by other clients and 'write_status_' must be checked by this client
- // before reading back the bad data.
- pool_->allocator_->AddCleanPage(cl, page);
- WriteDirtyPagesAsync(); // Start another asynchronous write if needed.
-
- // Notify before releasing lock to avoid race with Page and Client destruction.
- page->write_complete_cv_.NotifyAll();
- write_complete_cv_.NotifyAll();
- }
-}
-
-void BufferPool::Client::WaitForWrite(std::unique_lock<std::mutex>* client_lock, Page* page) {
- DCheckHoldsLock(*client_lock);
- while (in_flight_write_pages_.contains(page)) {
- SCOPED_TIMER(counters().write_wait_time);
- page->write_complete_cv_.Wait(*client_lock);
- }
-}
-
-void BufferPool::Client::WaitForAllWrites() {
- std::unique_lock<std::mutex> cl(lock_);
- while (in_flight_write_pages_.size() > 0) {
- write_complete_cv_.Wait(cl);
- }
-}
-*/
-string BufferPool::Client::DebugString() {
- std::lock_guard<std::mutex> lock(lock_);
- std::stringstream ss;
- ss << "<BufferPool::Client> " << this << " name: " << name_
- << " write_status: " << write_status_ << " buffers allocated " << buffers_allocated_bytes_
- << " num_pages: " << num_pages_ << " pinned_bytes: " << pinned_pages_.bytes()
- << " dirty_unpinned_bytes: " << dirty_unpinned_pages_.bytes()
- << " in_flight_write_bytes: " << in_flight_write_pages_.bytes();
- ss << "\n " << pinned_pages_.size() << " pinned pages: ";
- pinned_pages_.iterate(std::bind<bool>(Page::DebugStringCallback, &ss, std::placeholders::_1));
- ss << "\n " << dirty_unpinned_pages_.size() << " dirty unpinned pages: ";
- dirty_unpinned_pages_.iterate(
- std::bind<bool>(Page::DebugStringCallback, &ss, std::placeholders::_1));
- ss << "\n " << in_flight_write_pages_.size() << " in flight write pages: ";
- in_flight_write_pages_.iterate(
- std::bind<bool>(Page::DebugStringCallback, &ss, std::placeholders::_1));
- return ss.str();
-}
-
-string BufferPool::ClientHandle::DebugString() const {
- std::stringstream ss;
- if (is_registered()) {
- ss << "<BufferPool::Client> " << this << " internal state: {" << impl_->DebugString()
- << "}";
- return ss.str();
- } else {
- ss << "<BufferPool::ClientHandle> " << this << " UNREGISTERED";
- return ss.str();
- }
-}
-/*
-string BufferPool::PageHandle::DebugString() const {
- if (is_open()) {
- std::lock_guard<SpinLock> pl(page_->buffer_lock);
- return Substitute("<BufferPool::PageHandle> $0 client: $1/$2 page: {$3}", this,
- client_, client_->impl_, page_->DebugString());
- } else {
- return Substitute("<BufferPool::PageHandle> $0 CLOSED", this);
- }
-}
-*/
-string BufferPool::Page::DebugString() {
- std::stringstream ss;
- ss << "<BufferPool::Page> " << this << " len: " << len << " pin_count:" << pin_count
- << " buf:" << buffer.DebugString();
- return ss.str();
-}
-
-bool BufferPool::Page::DebugStringCallback(std::stringstream* ss, BufferPool::Page* page) {
- std::lock_guard<SpinLock> pl(page->buffer_lock);
- (*ss) << page->DebugString() << "\n";
- return true;
-}
-
-string BufferPool::BufferHandle::DebugString() const {
- std::stringstream ss;
- if (is_open()) {
- ss << "<BufferPool::BufferHandle> " << this << " client: " << client_ << "/"
- << client_->impl_ << " data: " << data_ << " len: " << len_;
- } else {
- ss << "<BufferPool::BufferHandle> " << this << " CLOSED";
- }
- return ss.str();
-}
-
-string BufferPool::DebugString() {
- std::stringstream ss;
- ss << "<BufferPool> " << this << " min_buffer_len: " << min_buffer_len_ << "\n"
- << allocator_->DebugString();
- return ss.str();
-}
-} // namespace doris
diff --git a/be/src/runtime/bufferpool/buffer_pool.h b/be/src/runtime/bufferpool/buffer_pool.h
deleted file mode 100644
index 469f5071db..0000000000
--- a/be/src/runtime/bufferpool/buffer_pool.h
+++ /dev/null
@@ -1,466 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <stdint.h>
-
-#include <string>
-#include <vector>
-
-#include "common/compiler_util.h"
-#include "common/object_pool.h"
-#include "common/status.h"
-#include "gutil/dynamic_annotations.h"
-#include "gutil/macros.h"
-#include "util/aligned_new.h"
-#include "util/internal_queue.h"
-#include "util/mem_range.h"
-#include "util/spinlock.h"
-
-namespace doris {
-
-class RuntimeProfile;
-class SystemAllocator;
-class MemTracker;
-
-/// A buffer pool that manages memory buffers for all queries in an Impala daemon.
-/// The buffer pool enforces buffer reservations, limits, and implements policies
-/// for moving spilled memory from in-memory buffers to disk. It also enables reuse of
-/// buffers between queries, to avoid frequent allocations.
-///
-/// The buffer pool can be used for allocating any large buffers (above a configurable
-/// minimum length), whether or not the buffers will be spilled. Smaller allocations
-/// are not serviced directly by the buffer pool: clients of the buffer pool must
-/// subdivide buffers if they wish to use smaller allocations.
-///
-/// All buffer pool operations are in the context of a registered buffer pool client.
-/// A buffer pool client should be created for every allocator of buffers at the level
-/// of granularity required for reporting and enforcement of reservations, e.g. an
-/// operator. The client tracks buffer reservations via its ReservationTracker and also
-/// includes info that is helpful for debugging (e.g. the operator that is associated
-/// with the buffer). Unless otherwise noted, it is not safe to invoke concurrent buffer
-/// pool operations for the same client.
-///
-/// Pages, Buffers and Pinning
-/// ==========================
-/// * A page is a logical block of memory that can reside in memory or on disk.
-/// * A buffer is a physical block of memory that can hold a page in memory.
-/// * A page handle is used by buffer pool clients to identify and access a page and
-/// the corresponding buffer. Clients do not interact with pages directly.
-/// * A buffer handle is used by buffer pool clients to identify and access a buffer.
-/// * A page is pinned if it has pin count > 0. A pinned page stays mapped to the same
-/// buffer.
-/// * An unpinned page can be written out to disk by the buffer pool so that the buffer
-/// can be used for another purpose.
-///
-/// Buffer/Page Sizes
-/// =================
-/// The buffer pool has a minimum buffer size, which must be a power-of-two. Page and
-/// buffer sizes must be an exact power-of-two multiple of the minimum buffer size.
-///
-/// Reservations
-/// ============
-/// Before allocating buffers or pinning pages, a client must reserve memory through its
-/// ReservationTracker. Reservation of n bytes give a client the right to allocate
-/// buffers or pin pages summing up to n bytes. Reservations are both necessary and
-/// sufficient for a client to allocate buffers or pin pages: the operations succeed
-/// unless a "system error" such as a disk write error is encountered that prevents
-/// unpinned pages from being written to disk.
-///
-/// More memory may be reserved than is used, e.g. if a client is not using its full
-/// reservation. In such cases, the buffer pool can use the free buffers in any way,
-/// e.g. for keeping unpinned pages in memory, so long as it is able to fulfill the
-/// reservations when needed, e.g. by flushing unpinned pages to disk.
-///
-/// Page/Buffer Handles
-/// ===================
-/// The buffer pool exposes PageHandles and BufferHandles, which are owned by clients of
-/// the buffer pool, and act as a proxy for the internal data structure representing the
-/// page or buffer in the buffer pool. Handles are "open" if they are associated with a
-/// page or buffer. An open PageHandle is obtained by creating a page. PageHandles are
-/// closed by calling BufferPool::DestroyPage(). An open BufferHandle is obtained by
-/// allocating a buffer or extracting a BufferHandle from a PageHandle. The buffer of a
-/// pinned page can also be accessed through the PageHandle. The handle destructors check
-/// for resource leaks, e.g. an open handle that would result in a buffer leak.
-///
-/// Pin Counting of Page Handles:
-/// ----------------------------------
-/// Page handles are scoped to a client. The invariants are as follows:
-/// * A page can only be accessed through an open handle.
-/// * A page is destroyed once the handle is destroyed via DestroyPage().
-/// * A page's buffer can only be accessed through a pinned handle.
-/// * Pin() can be called on an open handle, incrementing the handle's pin count.
-/// * Unpin() can be called on a pinned handle, but not an unpinned handle.
-/// * Pin() always increases usage of reservations, and Unpin() always decreases usage,
-/// i.e. the handle consumes <pin count> * <page size> bytes of reservation.
-///
-/// Example Usage: Buffers
-/// ==================================
-/// The simplest use case is to allocate a memory buffer.
-/// * The new buffer is created with AllocateBuffer().
-/// * The client reads and writes to the buffer as it sees fit.
-/// * If the client is done with the buffer's contents it can call FreeBuffer() to
-/// destroy the handle and free the buffer, or use TransferBuffer() to transfer
-/// the buffer to a different client.
-///
-/// Example Usage: Spillable Pages
-/// ==============================
-/// * In order to spill pages to disk, the Client must be registered with a FileGroup,
-/// which is used to allocate scratch space on disk.
-/// * A spilling operator creates a new page with CreatePage().
-/// * The client reads and writes to the page's buffer as it sees fit.
-/// * If the operator encounters memory pressure, it can decrease reservation usage by
-/// calling Unpin() on the page. The page may then be written to disk and its buffer
-/// repurposed internally by BufferPool.
-/// * Once the operator needs the page's contents again and has sufficient unused
-/// reservation, it can call Pin(), which brings the page's contents back into memory,
-/// perhaps in a different buffer. Therefore the operator must fix up any pointers into
-/// the previous buffer. Pin() executes asynchronously - the caller only blocks waiting
-/// for read I/O if it calls GetBuffer() or ExtractBuffer() while the read is in
-/// flight.
-/// * If the operator is done with the page, it can call DestroyPage() to destroy the
-/// handle and release resources, or call ExtractBuffer() to extract the buffer.
-///
-/// Synchronization
-/// ===============
-/// The data structures in the buffer pool itself are thread-safe. Client-owned data
-/// structures - Client, PageHandle and BufferHandle - are not protected from concurrent
-/// accesses. Clients must ensure that they do not invoke concurrent operations with the
-/// same Client, PageHandle or BufferHandle.
-class BufferPool : public CacheLineAligned {
-public:
- struct BufferAllocator;
- class BufferHandle;
- class ClientHandle;
- class PageHandle;
- /// Constructs a new buffer pool.
- /// 'min_buffer_len': the minimum buffer length for the pool. Must be a power of two.
- /// 'buffer_bytes_limit': the maximum physical memory in bytes that can be used by the
- /// buffer pool. If 'buffer_bytes_limit' is not a multiple of 'min_buffer_len', the
- /// remainder will not be usable.
- /// 'clean_page_bytes_limit': the maximum bytes of clean pages that will be retained by the
- /// buffer pool.
- BufferPool(int64_t min_buffer_len, int64_t buffer_bytes_limit, int64_t clean_page_bytes_limit);
- ~BufferPool();
-
- /// Register a client. Returns an error status and does not register the client if the
- /// arguments are invalid. 'name' is an arbitrary name used to identify the client in
- /// any errors messages or logging. If 'file_group' is non-nullptr, it is used to allocate
- /// scratch space to write unpinned pages to disk. If it is nullptr, unpinning of pages is
- /// not allowed for this client. Counters for this client are added to the (non-nullptr)
- /// 'profile'. 'client' is the client to register. 'client' must not already be
- /// registered.
- Status RegisterClient(const std::string& name, RuntimeProfile* profile,
- ClientHandle* client) WARN_UNUSED_RESULT;
-
- /// Deregister 'client' if it is registered. All pages must be destroyed and buffers
- /// must be freed for the client before calling this. Releases any reservation that
- /// belongs to the client. Idempotent.
- void DeregisterClient(ClientHandle* client);
-
- /// Create a new page of 'len' bytes with pin count 1. 'len' must be a page length
- /// supported by BufferPool (see BufferPool class comment). The client must have
- /// sufficient unused reservation to pin the new page (otherwise it will DCHECK).
- /// CreatePage() only fails when a system error prevents the buffer pool from fulfilling
- /// the reservation.
- /// On success, the handle is mapped to the new page and 'buffer', if non-nullptr, is set
- /// to the page's buffer.
- Status CreatePage(ClientHandle* client, int64_t len, PageHandle* handle,
- const BufferHandle** buffer = nullptr) WARN_UNUSED_RESULT;
-
- /// Increment the pin count of 'handle'. After Pin() the underlying page will
- /// be mapped to a buffer, which will be accessible through 'handle'. If the data
- /// was evicted from memory, it will be read back into memory asynchronously.
- /// Attempting to access the buffer with ExtractBuffer() or handle.GetBuffer() will
- /// block until the data is in memory. The caller is responsible for ensuring it has
- /// enough unused reservation before calling Pin() (otherwise it will DCHECK). Pin()
- /// only fails when a system error prevents the buffer pool from fulfilling the
- /// reservation or if an I/O error is encountered reading back data from disk.
- /// 'handle' must be open.
- Status Pin(ClientHandle* client, PageHandle* handle) WARN_UNUSED_RESULT;
-
- /// Decrement the pin count of 'handle'. Decrease client's reservation usage. If the
- /// handle's pin count becomes zero, it is no longer valid for the underlying page's
- /// buffer to be accessed via 'handle'. If the page's total pin count across all
- /// handles that reference it goes to zero, the page's data may be written to disk and
- /// the buffer reclaimed. 'handle' must be open and have a pin count > 0.
- ///
- /// It is an error to reduce the pin count to 0 if 'client' does not have an associated
- /// FileGroup.
- void Unpin(ClientHandle* client, PageHandle* handle);
-
- /// Destroy the page referenced by 'handle' (if 'handle' is open). Any buffers or disk
- /// storage backing the page are freed. Idempotent. If the page is pinned, the
- /// reservation usage is decreased accordingly.
- void DestroyPage(ClientHandle* client, PageHandle* handle);
-
- /// Extracts buffer from a pinned page. After this returns, the page referenced by
- /// 'page_handle' will be destroyed and 'buffer_handle' will reference the buffer from
- /// 'page_handle'. This may decrease reservation usage of 'client' if the page was
- /// pinned multiple times via 'page_handle'. May return an error if 'page_handle' was
- /// unpinned earlier with no subsequent GetBuffer() call and a read error is
- /// encountered while bringing the page back into memory.
- Status ExtractBuffer(ClientHandle* client, PageHandle* page_handle,
- BufferHandle* buffer_handle) WARN_UNUSED_RESULT;
-
- /// Allocates a new buffer of 'len' bytes. Uses reservation from 'client'. The caller
- /// is responsible for ensuring it has enough unused reservation before calling
- /// AllocateBuffer() (otherwise it will DCHECK). AllocateBuffer() only fails when
- /// a system error prevents the buffer pool from fulfilling the reservation.
- Status AllocateBuffer(ClientHandle* client, int64_t len,
- BufferHandle* handle) WARN_UNUSED_RESULT;
-
- /// If 'handle' is open, close 'handle', free the buffer and decrease the reservation
- /// usage from 'client'. Idempotent. Safe to call concurrently with any other
- /// operations for 'client'.
- void FreeBuffer(ClientHandle* client, BufferHandle* handle);
-
- /// Transfer ownership of buffer from 'src_client' to 'dst_client' and move the
- /// handle from 'src' to 'dst'. Increases reservation usage in 'dst_client' and
- /// decreases reservation usage in 'src_client'. 'src' must be open and 'dst' must be
- /// closed before calling. 'src'/'dst' and 'src_client'/'dst_client' must be different.
- /// After a successful call, 'src' is closed and 'dst' is open. Safe to call
- /// concurrently with any other operations for 'src_client'.
- Status TransferBuffer(ClientHandle* src_client, BufferHandle* src, ClientHandle* dst_client,
- BufferHandle* dst) WARN_UNUSED_RESULT;
-
- /// Try to release at least 'bytes_to_free' bytes of memory to the system allocator.
- /// TODO: once IMPALA-4834 is done and all large allocations are served from the buffer
- /// pool, this may not be necessary.
- void ReleaseMemory(int64_t bytes_to_free);
-
- /// Called periodically by a maintenance thread to release unused memory back to the
- /// system allocator.
- void Maintenance();
-
- /// Print a debug string with the state of the buffer pool.
- std::string DebugString();
-
- int64_t min_buffer_len() const { return min_buffer_len_; }
- int64_t GetSystemBytesLimit() const;
- int64_t GetSystemBytesAllocated() const;
-
- /// Return the limit on bytes of clean pages in the pool.
- int64_t GetCleanPageBytesLimit() const;
-
- /// Return the total number of clean pages in the pool.
- int64_t GetNumCleanPages() const;
-
- /// Return the total bytes of clean pages in the pool.
- int64_t GetCleanPageBytes() const;
-
- /// Return the total number of free buffers in the pool.
- int64_t GetNumFreeBuffers() const;
-
- /// Return the total bytes of free buffers in the pool.
- int64_t GetFreeBufferBytes() const;
-
- /// Generous upper bounds on page and buffer size and the number of different
- /// power-of-two buffer sizes.
- static constexpr int LOG_MAX_BUFFER_BYTES = 48;
- static constexpr int64_t MAX_BUFFER_BYTES = 1L << LOG_MAX_BUFFER_BYTES;
-
-protected:
- friend class BufferPoolTest;
- /// Test helper: get a reference to the allocator.
- BufferAllocator* allocator() { return allocator_.get(); }
-
-private:
- DISALLOW_COPY_AND_ASSIGN(BufferPool);
- class Client;
- class FreeBufferArena;
- class PageList;
- class Page;
-
- /// Allocator for allocating and freeing all buffer memory and managing lists of free
- /// buffers and clean pages.
- std::unique_ptr<BufferAllocator> allocator_;
-
- /// The minimum length of a buffer in bytes. All buffers and pages are a power-of-two
- /// multiple of this length. This is always a power of two.
- const int64_t min_buffer_len_;
-};
-
-/// External representation of a client of the BufferPool. Clients are used for
-/// reservation accounting, and will be used in the future for tracking per-client
-/// buffer pool counters. This class is the external handle for a client so
-/// each Client instance is owned by the BufferPool's client, rather than the BufferPool.
-/// Each Client should only be used by a single thread at a time: concurrently calling
-/// Client methods or BufferPool methods with the Client as an argument is not supported.
-class BufferPool::ClientHandle {
-public:
- ClientHandle() : impl_(nullptr) {}
- /// Client must be deregistered.
- ~ClientHandle() { DCHECK(!is_registered()); }
-
- bool is_registered() const { return impl_ != nullptr; }
-
- /// Return true if there are any unpinned pages for this client.
- bool has_unpinned_pages() const;
-
- std::string DebugString() const;
-
-private:
- friend class BufferPool;
- friend class BufferPoolTest;
- DISALLOW_COPY_AND_ASSIGN(ClientHandle);
-
- /// Internal state for the client. nullptr means the client isn't registered.
- /// Owned by BufferPool.
- Client* impl_;
-};
-
-/// A handle to a buffer allocated from the buffer pool. Each BufferHandle should only
-/// be used by a single thread at a time: concurrently calling BufferHandle methods or
-/// BufferPool methods with the BufferHandle as an argument is not supported.
-class BufferPool::BufferHandle {
-public:
- BufferHandle() { Reset(); }
- ~BufferHandle() { DCHECK(!is_open()); }
-
- /// Allow move construction of handles to support std::move(). Inline to make moving
- /// efficient.
- BufferHandle(BufferHandle&& src);
-
- /// Allow move assignment of handles to support STL classes like std::vector.
- /// Destination must be uninitialized. Inline to make moving efficient.
- BufferHandle& operator=(BufferHandle&& src);
-
- bool is_open() const { return data_ != nullptr; }
- int64_t len() const {
- DCHECK(is_open());
- return len_;
- }
- /// Get a pointer to the start of the buffer.
- uint8_t* data() const {
- DCHECK(is_open());
- return data_;
- }
-
- MemRange mem_range() const { return MemRange(data(), len()); }
-
- std::string DebugString() const;
-
- /// Poison the memory associated with this handle. If ASAN is not enabled, this is a
- /// no-op.
- void Poison() { ASAN_POISON_MEMORY_REGION(data(), len()); }
-
- /// Unpoison the memory associated with this handle. If ASAN is not enabled, this is a
- /// no-op.
- void Unpoison() { ASAN_UNPOISON_MEMORY_REGION(data(), len()); }
-
-private:
- DISALLOW_COPY_AND_ASSIGN(BufferHandle);
- friend class BufferPool;
- friend class SystemAllocator;
-
- /// Internal helper to set the handle to an opened state.
- void Open(uint8_t* data, int64_t len, int home_core);
-
- /// Internal helper to reset the handle to an unopened state. Inlined to make moving
- /// efficient.
- void Reset();
-
- /// The client the buffer handle belongs to, used to validate that the correct client
- /// is provided in BufferPool method calls. Set to nullptr if the buffer is in a free list.
- const ClientHandle* client_;
-
- /// Pointer to the start of the buffer. Non-nullptr if open, nullptr if closed.
- uint8_t* data_;
-
- /// Length of the buffer in bytes.
- int64_t len_;
-
- /// The CPU core that the buffer was allocated from - used to determine which arena
- /// it will be added to.
- int home_core_;
-};
-
-/// The handle for a page used by clients of the BufferPool. Each PageHandle should
-/// only be used by a single thread at a time: concurrently calling PageHandle methods
-/// or BufferPool methods with the PageHandle as an argument is not supported.
-class BufferPool::PageHandle {
-public:
- PageHandle();
- ~PageHandle() { DCHECK(!is_open()); }
-
- // Allow move construction of page handles, to support std::move().
- PageHandle(PageHandle&& src);
-
- // Allow move assignment of page handles, to support STL classes like std::vector.
- // Destination must be closed.
- PageHandle& operator=(PageHandle&& src);
-
- bool is_open() const { return page_ != nullptr; }
- bool is_pinned() const { return pin_count() > 0; }
- int pin_count() const;
- int64_t len() const;
-
- /// Get a reference to the page's buffer handle. Only valid to call if the page is
- /// pinned. If the page was previously unpinned and the read I/O for the data is still
- /// in flight, this can block waiting. Returns an error if an error was encountered
- /// reading the data back, which can only happen if Unpin() was called on the page
- /// since the last call to GetBuffer(). Only const accessors of the returned handle can
- /// be used: it is invalid to call FreeBuffer() or TransferBuffer() on it or to
- /// otherwise modify the handle.
- Status GetBuffer(const BufferHandle** buffer_handle) const WARN_UNUSED_RESULT;
-
- std::string DebugString() const;
-
-private:
- DISALLOW_COPY_AND_ASSIGN(PageHandle);
- friend class BufferPool;
- friend class BufferPoolTest;
- friend class Page;
-
- /// Internal helper to open the handle for the given page.
- void Open(Page* page, ClientHandle* client);
-
- /// Internal helper to reset the handle to an unopened state.
- void Reset();
-
- /// The internal page structure. nullptr if the handle is not open.
- Page* page_;
-
- /// The client the page handle belongs to.
- ClientHandle* client_;
-};
-
-inline BufferPool::BufferHandle::BufferHandle(BufferHandle&& src) {
- Reset();
- *this = std::move(src);
-}
-
-inline BufferPool::BufferHandle& BufferPool::BufferHandle::operator=(BufferHandle&& src) {
- DCHECK(!is_open());
- // Copy over all members then close src.
- client_ = src.client_;
- data_ = src.data_;
- len_ = src.len_;
- home_core_ = src.home_core_;
- src.Reset();
- return *this;
-}
-
-inline void BufferPool::BufferHandle::Reset() {
- client_ = nullptr;
- data_ = nullptr;
- len_ = -1;
- home_core_ = -1;
-}
-} // namespace doris
diff --git a/be/src/runtime/bufferpool/buffer_pool_counters.h b/be/src/runtime/bufferpool/buffer_pool_counters.h
deleted file mode 100644
index 7e3ccb79bc..0000000000
--- a/be/src/runtime/bufferpool/buffer_pool_counters.h
+++ /dev/null
@@ -1,43 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef DORIS_BE_RUNTIME_BUFFER_POOL_COUNTERS_H
-#define DORIS_BE_RUNTIME_BUFFER_POOL_COUNTERS_H
-
-#include "util/runtime_profile.h"
-
-namespace doris {
-
-/// A set of counters for each buffer pool client.
-struct BufferPoolClientCounters {
-public:
- /// Total amount of time spent inside BufferAllocator::AllocateBuffer().
- RuntimeProfile::Counter* alloc_time;
-
- /// Number of buffers allocated via BufferAllocator::AllocateBuffer().
- RuntimeProfile::Counter* cumulative_allocations;
-
- /// Bytes of buffers allocated via BufferAllocator::AllocateBuffer().
- RuntimeProfile::Counter* cumulative_bytes_alloced;
-
- /// The peak total size of unpinned pages.
- RuntimeProfile::HighWaterMarkCounter* peak_unpinned_bytes;
-};
-
-} // namespace doris
-
-#endif
diff --git a/be/src/runtime/bufferpool/buffer_pool_internal.h b/be/src/runtime/bufferpool/buffer_pool_internal.h
deleted file mode 100644
index 2b0a083268..0000000000
--- a/be/src/runtime/bufferpool/buffer_pool_internal.h
+++ /dev/null
@@ -1,299 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <memory>
-#include <mutex>
-#include <sstream>
-
-#include "runtime/bufferpool/buffer_pool.h"
-#include "runtime/bufferpool/buffer_pool_counters.h"
-
-// Ensure that DCheckConsistency() function calls get removed in release builds.
-#ifndef NDEBUG
-#define DCHECK_CONSISTENCY() DCheckConsistency()
-#else
-#define DCHECK_CONSISTENCY()
-#endif
-
-namespace doris {
-
-/// The internal representation of a page, which can be pinned or unpinned. See the
-/// class comment for explanation of the different page states.
-class BufferPool::Page : public InternalList<Page>::Node {
-public:
- Page(Client* client, int64_t len)
- : client(client), len(len), pin_count(0), pin_in_flight(false) {}
-
- std::string DebugString();
-
- // Helper for BufferPool::DebugString().
- static bool DebugStringCallback(std::stringstream* ss, BufferPool::Page* page);
-
- /// The client that the page belongs to.
- Client* const client;
-
- /// The length of the page in bytes.
- const int64_t len;
-
- /// The pin count of the page. Only accessed in contexts that are passed the associated
- /// PageHandle, so it cannot be accessed by multiple threads concurrently.
- int pin_count;
-
- /// True if the read I/O to pin the page was started but not completed. Only accessed
- /// in contexts that are passed the associated PageHandle, so it cannot be accessed
- /// by multiple threads concurrently.
- bool pin_in_flight;
-
- /// Non-null if there is a write in flight, the page is clean, or the page is evicted.
- //std::unique_ptr<TmpFileMgr::WriteHandle> write_handle;
-
- /// This lock must be held when accessing 'buffer' if the page is unpinned and not
- /// evicted (i.e. it is safe to access 'buffer' if the page is pinned or evicted).
- SpinLock buffer_lock;
-
- /// Buffer with the page's contents. Closed only iff page is evicted. Open otherwise.
- BufferHandle buffer;
-};
-
-/// Wrapper around InternalList<Page> that tracks the # of bytes in the list.
-class BufferPool::PageList {
-public:
- PageList() : bytes_(0) {}
- ~PageList() {
- // Clients always empty out their list before destruction.
- DCHECK(list_.empty());
- DCHECK_EQ(0, bytes_);
- }
-
- void enqueue(Page* page) {
- list_.enqueue(page);
- bytes_ += page->len;
- }
-
- bool remove(Page* page) {
- if (list_.remove(page)) {
- bytes_ -= page->len;
- return true;
- }
- return false;
- }
-
- Page* dequeue() {
- Page* page = list_.dequeue();
- if (page != nullptr) {
- bytes_ -= page->len;
- }
- return page;
- }
-
- Page* pop_back() {
- Page* page = list_.pop_back();
- if (page != nullptr) {
- bytes_ -= page->len;
- }
- return page;
- }
-
- void iterate(std::function<bool(Page*)> fn) { list_.iterate(fn); }
- bool contains(Page* page) { return list_.contains(page); }
- Page* tail() { return list_.tail(); }
- bool empty() const { return list_.empty(); }
- int size() const { return list_.size(); }
- int64_t bytes() const { return bytes_; }
-
- void DCheckConsistency() {
- DCHECK_GE(bytes_, 0);
- DCHECK_EQ(list_.empty(), bytes_ == 0);
- }
-
-private:
- InternalList<Page> list_;
- int64_t bytes_;
-};
-
-/// The internal state for the client.
-class BufferPool::Client {
-public:
- Client(BufferPool* pool, //TmpFileMgr::FileGroup* file_group,
- const std::string& name, RuntimeProfile* profile);
-
- ~Client() {
- DCHECK_EQ(0, num_pages_);
- DCHECK_EQ(0, buffers_allocated_bytes_);
- }
-
- void Close() {}
-
- /// Create a pinned page using 'buffer', which was allocated using AllocateBuffer().
- /// No client or page locks should be held by the caller.
- Page* CreatePinnedPage(BufferHandle&& buffer);
-
- /// Reset 'handle', clean up references to handle->page and release any resources
- /// associated with handle->page. If the page is pinned, 'out_buffer' can be passed in
- /// and the page's buffer will be returned.
- /// Neither the client's lock nor handle->page_->buffer_lock should be held by the
- /// caller.
- void DestroyPageInternal(PageHandle* handle, BufferHandle* out_buffer = nullptr);
-
- /// Updates client state to reflect that 'page' is now a dirty unpinned page. May
- /// initiate writes for this or other dirty unpinned pages.
- /// Neither the client's lock nor page->buffer_lock should be held by the caller.
- void MoveToDirtyUnpinned(Page* page);
-
- /// Move an unpinned page to the pinned state, moving between data structures and
- /// reading from disk if necessary. Ensures the page has a buffer. If the data is
- /// already in memory, ensures the data is in the page's buffer. If the data is on
- /// disk, starts an async read of the data and sets 'pin_in_flight' on the page to
- /// true. Neither the client's lock nor page->buffer_lock should be held by the caller.
- Status StartMoveToPinned(ClientHandle* client, Page* page) WARN_UNUSED_RESULT;
-
- /// Moves a page that has a pin in flight back to the evicted state, undoing
- /// StartMoveToPinned(). Neither the client's lock nor page->buffer_lock should be held
- /// by the caller.
- //void UndoMoveEvictedToPinned(Page* page);
-
- /// Finish the work of bring the data of an evicted page to memory if
- /// page->pin_in_flight was set to true by StartMoveToPinned().
- //Status FinishMoveEvictedToPinned(Page* page) WARN_UNUSED_RESULT;
-
- /// Must be called once before allocating a buffer of 'len' via the AllocateBuffer()
- /// API to deduct from the client's reservation and update internal accounting. Cleans
- /// dirty pages if needed to satisfy the buffer pool's internal invariants. No page or
- /// client locks should be held by the caller.
- Status PrepareToAllocateBuffer(int64_t len) WARN_UNUSED_RESULT;
-
- /// Called after a buffer of 'len' is freed via the FreeBuffer() API to update
- /// internal accounting and release the buffer to the client's reservation. No page or
- /// client locks should be held by the caller.
- void FreedBuffer(int64_t len) {
- std::lock_guard<std::mutex> cl(lock_);
- buffers_allocated_bytes_ -= len;
- DCHECK_CONSISTENCY();
- }
-
- /// Wait for the in-flight write for 'page' to complete.
- /// 'lock_' must be held by the caller via 'client_lock'. page->buffer_lock should
- /// not be held.
- //void WaitForWrite(std::unique_lock<std::mutex>* client_lock, Page* page);
-
- /// Test helper: wait for all in-flight writes to complete.
- /// 'lock_' must not be held by the caller.
- //void WaitForAllWrites();
-
- /// Asserts that 'client_lock' is holding 'lock_'.
- void DCheckHoldsLock(const std::unique_lock<std::mutex>& client_lock) {
- DCHECK(client_lock.mutex() == &lock_ && client_lock.owns_lock());
- }
-
- const BufferPoolClientCounters& counters() const { return counters_; }
- //bool spilling_enabled() const { return file_group_ != nullptr; }
- void set_debug_write_delay_ms(int val) { debug_write_delay_ms_ = val; }
- bool has_unpinned_pages() const {
- // Safe to read without lock since other threads should not be calling BufferPool
- // functions that create, destroy or unpin pages.
- return pinned_pages_.size() < num_pages_;
- }
-
- std::string DebugString();
-
-private:
- // Check consistency of client, DCHECK if inconsistent. 'lock_' must be held.
- void DCheckConsistency() {
- DCHECK_GE(buffers_allocated_bytes_, 0);
- pinned_pages_.DCheckConsistency();
- dirty_unpinned_pages_.DCheckConsistency();
- in_flight_write_pages_.DCheckConsistency();
- DCHECK_LE(
- pinned_pages_.size() + dirty_unpinned_pages_.size() + in_flight_write_pages_.size(),
- num_pages_);
- }
-
- /// Must be called once before allocating or reclaiming a buffer of 'len'. Ensures that
- /// enough dirty pages are flushed to disk to satisfy the buffer pool's internal
- /// invariants after the allocation. 'lock_' should be held by the caller via
- /// 'client_lock'
- Status CleanPages(std::unique_lock<std::mutex>* client_lock, int64_t len);
-
- /// Initiates asynchronous writes of dirty unpinned pages to disk. Ensures that at
- /// least 'min_bytes_to_write' bytes of writes will be written asynchronously. May
- /// start writes more aggressively so that I/O and compute can be overlapped. If
- /// any errors are encountered, 'write_status_' is set. 'write_status_' must therefore
- /// be checked before reading back any pages. 'lock_' must be held by the caller.
- //void WriteDirtyPagesAsync(int64_t min_bytes_to_write = 0);
-
- /// Called when a write for 'page' completes.
- //void WriteCompleteCallback(Page* page, const Status& write_status);
-
- /// Move an evicted page to the pinned state by allocating a new buffer, starting an
- /// async read from disk and moving the page to 'pinned_pages_'. client->impl must be
- /// locked by the caller via 'client_lock' and handle->page must be unlocked.
- /// 'client_lock' is released then reacquired.
- //Status StartMoveEvictedToPinned(
- // std::unique_lock<std::mutex>* client_lock, ClientHandle* client, Page* page);
-
- /// The buffer pool that owns the client.
- BufferPool* const pool_;
-
- /// The file group that should be used for allocating scratch space. If nullptr, spilling
- /// is disabled.
- //TmpFileMgr::FileGroup* const file_group_;
-
- /// A name identifying the client.
- const std::string name_;
-
- /// The RuntimeProfile counters for this client, owned by the client's RuntimeProfile.
- /// All non-nullptr.
- BufferPoolClientCounters counters_;
-
- /// Debug option to delay write completion.
- int debug_write_delay_ms_;
-
- /// Lock to protect the below member variables;
- std::mutex lock_;
-
- /// All non-OK statuses returned by write operations are merged into this status.
- /// All operations that depend on pages being written to disk successfully (e.g.
- /// reading pages back from disk) must check 'write_status_' before proceeding, so
- /// that write errors that occurred asynchronously are correctly propagated. The
- /// write error is global to the client so can be propagated to any Status-returning
- /// operation for the client (even for operations on different Pages or Buffers).
- /// Write errors are not recoverable so it is best to propagate them as quickly
- /// as possible, instead of waiting to propagate them in a specific way.
- Status write_status_;
-
- /// Total number of pages for this client. Used for debugging and enforcing that all
- /// pages are destroyed before the client.
- int64_t num_pages_;
-
- /// Total bytes of buffers in BufferHandles returned to clients (i.e. obtained from
- /// AllocateBuffer() or ExtractBuffer()).
- int64_t buffers_allocated_bytes_;
-
- /// All pinned pages for this client.
- PageList pinned_pages_;
-
- /// Dirty unpinned pages for this client for which writes are not in flight. Page
- /// writes are started in LIFO order, because operators typically have sequential access
- /// patterns where the most recently evicted page will be last to be read.
- PageList dirty_unpinned_pages_;
-
- /// Dirty unpinned pages for this client for which writes are in flight.
- PageList in_flight_write_pages_;
-};
-} // namespace doris
diff --git a/be/src/runtime/bufferpool/free_list.h b/be/src/runtime/bufferpool/free_list.h
deleted file mode 100644
index 7121033de8..0000000000
--- a/be/src/runtime/bufferpool/free_list.h
+++ /dev/null
@@ -1,115 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <cstdint>
-#include <vector>
-
-#include "common/logging.h"
-#include "gutil/macros.h"
-#include "runtime/bufferpool/buffer_pool.h"
-
-namespace doris {
-
-using BufferHandle = BufferPool::BufferHandle;
-
-/// A non-threadsafe list of free buffers.
-///
-/// Buffers are allocated by the caller and can be added to the list for later retrieval
-/// with AddFreeBuffer(). If the list is non-empty, calling PopFreeBuffer() will return
-/// one of the buffers previously added to the list. FreeList is agnostic about the size
-/// or other properties of the buffers added to it.
-///
-/// Buffers in the list can be freed at any point, e.g. if the list is storing too many
-/// free buffers (according to some policy). The caller is responsible for implementing
-/// the policy and calling FreeBuffers() or FreeAll() at the appropriate times.
-///
-/// Address space fragmentation
-/// ---------------------------
-/// To reduce memory fragmentation, the free list hands out buffers with lower memory
-/// addresses first and frees buffers with higher memory address first. If buffers were
-/// handed out by a policy that didn't take memory address into account, over time the
-/// distribution of free buffers within the address space would become essentially
-/// random. If free buffers were then unmapped, there would be many holes in the virtual
-/// memory map, which can cause difficulties for the OS in some cases, e.g. exceeding the
-/// maximum number of mmapped() regions (vm.max_map_count) in Linux. Using this approach
-/// will tend to consolidate free buffers in higher parts of the address space, allowing
-/// coalescing of the holes in most cases.
-class FreeList {
-public:
- FreeList() {}
-
- /// Gets a free buffer. If the list is non-empty, returns true and sets 'buffer' to
- /// one of the buffers previously added with AddFreeBuffer(). Otherwise returns false.
- bool PopFreeBuffer(BufferHandle* buffer) {
- if (free_list_.empty()) return false;
- std::pop_heap(free_list_.begin(), free_list_.end(), HeapCompare);
- *buffer = std::move(free_list_.back());
- free_list_.pop_back();
- return true;
- }
-
- /// Adds a free buffer to the list.
- void AddFreeBuffer(BufferHandle&& buffer) {
- buffer.Poison();
- free_list_.emplace_back(std::move(buffer));
- std::push_heap(free_list_.begin(), free_list_.end(), HeapCompare);
- }
-
- /// Get the 'num_buffers' buffers with the highest memory address from the list to
- /// free. The average time complexity is n log n, where n is the current size of the
- /// list.
- std::vector<BufferHandle> GetBuffersToFree(int64_t num_buffers) {
- std::vector<BufferHandle> buffers;
- DCHECK_LE(num_buffers, free_list_.size());
- // Sort the list so we can free the buffers with higher memory addresses.
- // Note that the sorted list is still a valid min-heap.
- std::sort(free_list_.begin(), free_list_.end(), SortCompare);
-
- for (int64_t i = 0; i < num_buffers; ++i) {
- buffers.emplace_back(std::move(free_list_.back()));
- free_list_.pop_back();
- }
- return buffers;
- }
-
- /// Returns the number of buffers currently in the list.
- int64_t Size() const { return free_list_.size(); }
-
-private:
- friend class FreeListTest;
-
- DISALLOW_COPY_AND_ASSIGN(FreeList);
-
- /// Compare function that orders by memory address.
- static bool SortCompare(const BufferHandle& b1, const BufferHandle& b2) {
- return b1.data() < b2.data();
- }
-
- /// Compare function that orders by memory address. Needs to be inverse of SortCompare()
- /// because C++ provides a max-heap.
- static bool HeapCompare(const BufferHandle& b1, const BufferHandle& b2) {
- return SortCompare(b2, b1);
- }
-
- /// List of free memory buffers. Maintained as a min-heap ordered by the memory address
- /// of the buffer.
- std::vector<BufferHandle> free_list_;
-};
-} // namespace doris
diff --git a/be/src/runtime/bufferpool/suballocator.cc b/be/src/runtime/bufferpool/suballocator.cc
deleted file mode 100644
index f26aee6205..0000000000
--- a/be/src/runtime/bufferpool/suballocator.cc
+++ /dev/null
@@ -1,252 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "runtime/bufferpool/suballocator.h"
-
-#include <new>
-
-#include "gutil/strings/substitute.h"
-#include "util/bit_util.h"
-
-namespace doris {
-
-constexpr int Suballocator::LOG_MAX_ALLOCATION_BYTES;
-constexpr int64_t Suballocator::MAX_ALLOCATION_BYTES;
-constexpr int Suballocator::LOG_MIN_ALLOCATION_BYTES;
-constexpr int64_t Suballocator::MIN_ALLOCATION_BYTES;
-//const int Suballocator::NUM_FREE_LISTS;
-
-Suballocator::Suballocator(BufferPool* pool, BufferPool::ClientHandle* client,
- int64_t min_buffer_len)
- : pool_(pool), client_(client), min_buffer_len_(min_buffer_len), allocated_(0) {}
-
-Suballocator::~Suballocator() {
- // All allocations should be free and buffers deallocated.
- DCHECK_EQ(allocated_, 0);
- for (int i = 0; i < NUM_FREE_LISTS; ++i) {
- DCHECK(free_lists_[i] == nullptr);
- }
-}
-
-Status Suballocator::Allocate(int64_t bytes, std::unique_ptr<Suballocation>* result) {
- DCHECK_GE(bytes, 0);
- if (UNLIKELY(bytes > MAX_ALLOCATION_BYTES)) {
- std::stringstream err_stream;
- err_stream << "Requested memory allocation of " << bytes << " bytes, larger than std::max "
- << "supported of " << MAX_ALLOCATION_BYTES << " bytes";
- return Status::InternalError(err_stream.str());
- }
- std::unique_ptr<Suballocation> free_node;
- bytes = std::max(bytes, MIN_ALLOCATION_BYTES);
- const int target_list_idx = ComputeListIndex(bytes);
- for (int i = target_list_idx; i < NUM_FREE_LISTS; ++i) {
- free_node = PopFreeListHead(i);
- if (free_node != nullptr) break;
- }
-
- if (free_node == nullptr) {
- // Unable to find free allocation, need to get more memory from buffer pool.
- RETURN_IF_ERROR(AllocateBuffer(bytes, &free_node));
- if (free_node == nullptr) {
- *result = nullptr;
- return Status::OK();
- }
- }
-
- // Free node may be larger than required.
- const int free_list_idx = ComputeListIndex(free_node->len_);
- if (free_list_idx != target_list_idx) {
- RETURN_IF_ERROR(SplitToSize(std::move(free_node), bytes, &free_node));
- DCHECK(free_node != nullptr);
- }
-
- free_node->in_use_ = true;
- allocated_ += free_node->len_;
- *result = std::move(free_node);
- return Status::OK();
-}
-
-int Suballocator::ComputeListIndex(int64_t bytes) const {
- return BitUtil::Log2CeilingNonZero64(bytes) - LOG_MIN_ALLOCATION_BYTES;
-}
-
-uint64_t Suballocator::ComputeAllocateBufferSize(int64_t bytes) const {
- bytes = std::max(bytes, MIN_ALLOCATION_BYTES);
- const int target_list_idx = ComputeListIndex(bytes);
- for (int i = target_list_idx; i < NUM_FREE_LISTS; ++i) {
- if (CheckFreeListHeadNotNull(i)) return 0;
- }
- return std::max(min_buffer_len_, BitUtil::RoundUpToPowerOfTwo(bytes));
-}
-
-Status Suballocator::AllocateBuffer(int64_t bytes, std::unique_ptr<Suballocation>* result) {
- DCHECK_LE(bytes, MAX_ALLOCATION_BYTES);
- const int64_t buffer_len = std::max(min_buffer_len_, BitUtil::RoundUpToPowerOfTwo(bytes));
-
- std::unique_ptr<Suballocation> free_node;
- RETURN_IF_ERROR(Suballocation::Create(&free_node));
- RETURN_IF_ERROR(pool_->AllocateBuffer(client_, buffer_len, &free_node->buffer_));
-
- free_node->data_ = free_node->buffer_.data();
- free_node->len_ = buffer_len;
- *result = std::move(free_node);
- return Status::OK();
-}
-
-Status Suballocator::SplitToSize(std::unique_ptr<Suballocation> free_node, int64_t target_bytes,
- std::unique_ptr<Suballocation>* result) {
- DCHECK(!free_node->in_use_);
- DCHECK_GT(free_node->len_, target_bytes);
-
- const int free_list_idx = ComputeListIndex(free_node->len_);
- const int target_list_idx = ComputeListIndex(target_bytes);
-
- // Preallocate nodes to avoid handling allocation failures during splitting.
- // Need two nodes per level for the left and right children.
- const int num_nodes = (free_list_idx - target_list_idx) * 2;
- constexpr int MAX_NUM_NODES = NUM_FREE_LISTS * 2;
- std::unique_ptr<Suballocation> nodes[MAX_NUM_NODES];
- for (int i = 0; i < num_nodes; ++i) {
- if (!Suballocation::Create(&nodes[i]).ok()) {
- // Add the free node to the free list to restore the allocator to an internally
- // consistent state.
- AddToFreeList(std::move(free_node));
- return Status::InternalError("Failed to allocate list node in Suballocator");
- }
- }
-
- // Iteratively split from the current size down to the target size. We will return
- // the leftmost descendant node.
- int next_node = 0;
- for (int i = free_list_idx; i > target_list_idx; --i) {
- DCHECK_EQ(free_node->len_, 1LL << (i + LOG_MIN_ALLOCATION_BYTES));
- std::unique_ptr<Suballocation> left_child = std::move(nodes[next_node++]);
- std::unique_ptr<Suballocation> right_child = std::move(nodes[next_node++]);
- DCHECK_LE(next_node, num_nodes);
-
- const int64_t child_len = free_node->len_ / 2;
- left_child->data_ = free_node->data_;
- right_child->data_ = free_node->data_ + child_len;
- left_child->len_ = right_child->len_ = child_len;
- left_child->buddy_ = right_child.get();
- right_child->buddy_ = left_child.get();
- free_node->in_use_ = true;
- left_child->parent_ = std::move(free_node);
-
- AddToFreeList(std::move(right_child));
- free_node = std::move(left_child);
- }
- *result = std::move(free_node);
- return Status::OK();
-}
-
-uint64_t Suballocator::Free(std::unique_ptr<Suballocation> allocation) {
- if (allocation == nullptr) return 0;
-
- DCHECK(allocation->in_use_);
- allocation->in_use_ = false;
- allocated_ -= allocation->len_;
-
- // Iteratively coalesce buddies until the buddy is in use or we get to the root.
- // This ensures that all buddies in the free lists are coalesced. I.e. we do not
- // have two buddies in the same free list.
- std::unique_ptr<Suballocation> curr_allocation = std::move(allocation);
- while (curr_allocation->buddy_ != nullptr) {
- if (curr_allocation->buddy_->in_use_) {
- // If the buddy is not free we can't coalesce, just add it to free list.
- AddToFreeList(std::move(curr_allocation));
- return 0;
- }
- std::unique_ptr<Suballocation> buddy = RemoveFromFreeList(curr_allocation->buddy_);
- curr_allocation = CoalesceBuddies(std::move(curr_allocation), std::move(buddy));
- }
-
- // Reached root, which is an entire free buffer. We are not using it, so free up memory.
- DCHECK(curr_allocation->buffer_.is_open());
- auto free_len = curr_allocation->buffer_.len();
- pool_->FreeBuffer(client_, &curr_allocation->buffer_);
- curr_allocation.reset();
- return free_len;
-}
-
-void Suballocator::AddToFreeList(std::unique_ptr<Suballocation> node) {
- DCHECK(!node->in_use_);
- int list_idx = ComputeListIndex(node->len_);
- if (free_lists_[list_idx] != nullptr) {
- free_lists_[list_idx]->prev_free_ = node.get();
- }
- node->next_free_ = std::move(free_lists_[list_idx]);
- DCHECK(node->prev_free_ == nullptr);
- free_lists_[list_idx] = std::move(node);
-}
-
-std::unique_ptr<Suballocation> Suballocator::RemoveFromFreeList(Suballocation* node) {
- DCHECK(node != nullptr);
- const int list_idx = ComputeListIndex(node->len_);
-
- if (node->next_free_ != nullptr) {
- node->next_free_->prev_free_ = node->prev_free_;
- }
-
- std::unique_ptr<Suballocation>* ptr_from_prev =
- node->prev_free_ == nullptr ? &free_lists_[list_idx] : &node->prev_free_->next_free_;
- node->prev_free_ = nullptr;
- std::unique_ptr<Suballocation> result = std::move(*ptr_from_prev);
- *ptr_from_prev = std::move(node->next_free_);
- return result;
-}
-
-std::unique_ptr<Suballocation> Suballocator::PopFreeListHead(int list_idx) {
- if (free_lists_[list_idx] == nullptr) return nullptr;
- std::unique_ptr<Suballocation> result = std::move(free_lists_[list_idx]);
- DCHECK(result->prev_free_ == nullptr);
- if (result->next_free_ != nullptr) {
- result->next_free_->prev_free_ = nullptr;
- }
- free_lists_[list_idx] = std::move(result->next_free_);
- return result;
-}
-
-bool Suballocator::CheckFreeListHeadNotNull(int list_idx) const {
- return free_lists_[list_idx] != nullptr;
-}
-
-std::unique_ptr<Suballocation> Suballocator::CoalesceBuddies(std::unique_ptr<Suballocation> b1,
- std::unique_ptr<Suballocation> b2) {
- DCHECK(!b1->in_use_);
- DCHECK(!b2->in_use_);
- DCHECK_EQ(b1->buddy_, b2.get());
- DCHECK_EQ(b2->buddy_, b1.get());
- // Only the left child's parent should be present.
- DCHECK((b1->parent_ != nullptr) ^ (b2->parent_ != nullptr));
- std::unique_ptr<Suballocation> parent =
- b1->parent_ != nullptr ? std::move(b1->parent_) : std::move(b2->parent_);
- parent->in_use_ = false;
- return parent;
-}
-
-Status Suballocation::Create(std::unique_ptr<Suballocation>* new_suballocation) {
- // Allocate from system allocator for simplicity. We don't expect this to be
- // performance critical or to be used for small allocations where CPU/memory
- // overhead of these allocations might be a consideration.
- new_suballocation->reset(new (std::nothrow) Suballocation());
- if (*new_suballocation == nullptr) {
- return Status::MemoryAllocFailed("allocate memory failed");
- }
- return Status::OK();
-}
-} // namespace doris
diff --git a/be/src/runtime/bufferpool/suballocator.h b/be/src/runtime/bufferpool/suballocator.h
deleted file mode 100644
index eb5fd0d1fb..0000000000
--- a/be/src/runtime/bufferpool/suballocator.h
+++ /dev/null
@@ -1,221 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-
-#include "runtime/bufferpool/buffer_pool.h"
-
-namespace doris {
-
-class Suballocation;
-
-/// Helper class to subdivide buffers from the buffer pool. Implements a buddy
-/// allocation algorithm optimised for power-of-two allocations. At or above the
-/// 'min_buffer_len' value, each allocation is backed by a power-of-two buffer from
-/// a BufferPool. Below that threshold, each allocation is backed by a
-/// 'min_buffer_len' buffer split recursively into equal-sized buddies until the
-/// desired allocation size is reached. Every time an allocation is freed,
-/// free buddies are coalesced eagerly and whole buffers are freed eagerly.
-///
-/// The algorithms used are asymptotically efficient: O(log(max allocation size)), but
-/// the implementation's constant-factor overhead is not optimised. Thus, the allocator
-/// is best suited for relatively large allocations where the constant CPU/memory
-/// overhead per allocation is not paramount, e.g. bucket directories of hash tables.
-/// All allocations less than MIN_ALLOCATION_BYTES are rounded up to that amount.
-///
-/// Methods of Suballocator are not thread safe.
-///
-/// Implementation:
-/// ---------------
-/// The allocator uses two key data structures: a number of binary trees representing
-/// the buddy relationships between allocations and a set of free lists, one for each
-/// power-of-two size.
-///
-/// Each buffer allocated from the buffer pool has a tree of Suballocations associated
-/// with it that use the memory from that buffer. The root of the tree is the
-/// Suballocation corresponding to the entire buffer. Each node has either zero children
-/// (if it hasn't been split) or two children (if it has been split into two buddy
-/// allocations). Each non-root Suballocation has pointers to its buddy and its parent
-/// to enable coalescing the buddies into the parent when both are free.
-///
-/// Suballocations are eagerly coalesced when freed, so a Suballocation only has children
-/// if one of its descendants is allocated.
-///
-/// The free lists are doubly-linked lists of free Suballocation objects that support
-/// O(1) add and remove. The next and previous pointers are stored in the
-/// Suballocation object so no auxiliary memory is required.
-class Suballocator {
-public:
- /// Constructs a suballocator that allocates memory from 'pool' with 'client'.
- /// Suballocations smaller than 'min_buffer_len' are handled by allocating a
- /// buffer of 'min_buffer_len' and recursively splitting it.
- Suballocator(BufferPool* pool, BufferPool::ClientHandle* client, int64_t min_buffer_len);
-
- ~Suballocator();
- /// Compute how many mem will be allocated from BufferPool. We will use it to try
- /// consume mem in BufferedBlockMgr.
- uint64_t ComputeAllocateBufferSize(int64_t bytes) const;
- /// Allocate bytes from BufferPool. The allocation is nullptr if unsuccessful because
- /// the client's reservation was insufficient. If an unexpected error is encountered,
- /// returns that status. The allocation size is rounded up to the next power-of-two.
- /// The caller must always free the allocation by calling Free() (otherwise destructing
- /// the returned 'result' will DCHECK on debug builds or otherwise misbehave on release
- /// builds).
- ///
- /// Allocate() will try to increase the client's buffer pool reservation to fulfill
- /// the requested allocation if needed.
- ///
- /// The memory returned is at least 8-byte aligned.
- Status Allocate(int64_t bytes, std::unique_ptr<Suballocation>* result);
-
- /// Free the allocation. Does nothing if allocation is nullptr (e.g. was the result of a
- /// failed Allocate() call). Return how many really release in BufferPool, release mem in BufferedBlockMgr.
- uint64_t Free(std::unique_ptr<Suballocation> allocation);
-
- /// Upper bounds on the max allocation size and the number of different
- /// power-of-two allocation sizes. Used to bound the number of free lists.
- static constexpr int LOG_MAX_ALLOCATION_BYTES = BufferPool::LOG_MAX_BUFFER_BYTES;
- static constexpr int64_t MAX_ALLOCATION_BYTES = BufferPool::MAX_BUFFER_BYTES;
-
- /// Don't support allocations less than 4kb to avoid high overhead.
- static constexpr int LOG_MIN_ALLOCATION_BYTES = 12;
- static constexpr int64_t MIN_ALLOCATION_BYTES = 1L << LOG_MIN_ALLOCATION_BYTES;
-
-private:
- DISALLOW_COPY_AND_ASSIGN(Suballocator);
-
- /// Compute the index for allocations of size 'bytes' in 'free_lists_'. 'bytes' is
- /// rounded up to the next power-of-two if it is not already a power-of-two.
- int ComputeListIndex(int64_t bytes) const;
-
- /// Allocate a buffer of size 'bytes' < MAX_ALLOCATION_BYTES from the buffer pool and
- /// initialize 'result' with it. If the reservation is insufficient, try to increase
- /// the reservation to fit.
- Status AllocateBuffer(int64_t bytes, std::unique_ptr<Suballocation>* result);
-
- /// Split the free allocation until we get an allocation of 'target_bytes' rounded up
- /// to a power-of-two. This allocation is returned. The other allocations resulting
- /// from the splits are added to free lists. node->in_use must be false and 'node'
- /// must not be in any free list. Can fail if allocating memory for data structures
- /// fails.
- Status SplitToSize(std::unique_ptr<Suballocation> node, int64_t target_bytes,
- std::unique_ptr<Suballocation>* result);
-
- // Add allocation to the free list with given index.
- void AddToFreeList(std::unique_ptr<Suballocation> node);
-
- // Remove allocation from its free list.
- std::unique_ptr<Suballocation> RemoveFromFreeList(Suballocation* node);
-
- // Get the allocation at the head of the free list at index 'list_idx'. Return nullptr
- // if list is empty.
- std::unique_ptr<Suballocation> PopFreeListHead(int list_idx);
-
- // Check list_idx of Free List whether is nullptr
- bool CheckFreeListHeadNotNull(int list_idx) const;
-
- /// Coalesce two free buddies, 'b1' and 'b2'. Frees 'b1' and 'b2' and marks the parent
- /// not in use.
- std::unique_ptr<Suballocation> CoalesceBuddies(std::unique_ptr<Suballocation> b1,
- std::unique_ptr<Suballocation> b2);
-
- /// The pool and corresponding client to allocate buffers from.
- BufferPool* pool_;
- BufferPool::ClientHandle* client_;
-
- /// The minimum length of buffer to allocate. To serve allocations below this threshold,
- /// a larger buffer is allocated and split into multiple allocations.
- const int64_t min_buffer_len_;
-
- /// Track how much memory has been returned in allocations but not freed.
- int64_t allocated_;
-
- /// Free lists for each supported power-of-two size. Statically allocate the maximum
- /// possible number of lists for simplicity. Indexed by log2 of the allocation size
- /// minus log2 of the minimum allocation size, e.g. 16k allocations are at index 2.
- /// Each free list should only include one buddy of each pair: if both buddies are
- /// free, they should have been coalesced.
- ///
- /// Each free list is implemented as a doubly-linked list.
- static constexpr int NUM_FREE_LISTS = LOG_MAX_ALLOCATION_BYTES - LOG_MIN_ALLOCATION_BYTES + 1;
- std::unique_ptr<Suballocation> free_lists_[NUM_FREE_LISTS];
-};
-
-/// An allocation made by a Suballocator. Each allocation returned by Suballocator must
-/// be freed with Suballocator::Free().
-///
-/// Unique_ptr is used to manage ownership of these Suballocations as a guard against
-/// memory leaks. The owner of the unique_ptr is either:
-/// - client code, if the suballocation is in use
-/// - the free list array, if the suballocation is the head of a free list
-/// - the previous free list entry, if the suballocation is a subsequent free list entry
-/// - the suballocation's left child, if the suballocation is split
-class Suballocation {
-public:
- // Checks that the allocation is not in use (therefore not leaked).
- ~Suballocation() { DCHECK(!in_use_); }
-
- uint8_t* data() const { return data_; }
- int64_t len() const { return len_; }
-
-private:
- friend class Suballocator;
-
- DISALLOW_COPY_AND_ASSIGN(Suballocation);
-
- /// Static constructor for Suballocation. Can fail if new fails to allocate memory.
- static Status Create(std::unique_ptr<Suballocation>* new_suballocation);
-
- // The actual constructor - Create() is used for its better error handling.
- Suballocation()
- : data_(nullptr), len_(-1), buddy_(nullptr), prev_free_(nullptr), in_use_(false) {}
-
- /// The allocation's data and its length.
- uint8_t* data_;
- int64_t len_;
-
- /// The buffer backing the Suballocation, if the Suballocation is backed by an entire
- /// buffer. Otherwise uninitialized. 'buffer_' is open iff 'buddy_' is nullptr.
- BufferPool::BufferHandle buffer_;
-
- /// If this is a left child, the parent of this and its buddy. The parent's allocation
- /// is the contiguous memory buffer comprised of the two allocations. We store the
- /// parent in only the left child so that it is uniquely owned.
- std::unique_ptr<Suballocation> parent_;
-
- /// The buddy allocation of this allocation. The buddy's memory buffer is the same
- /// size and adjacent in memory. Two buddy Suballocation objects have the same
- /// lifetime: they are created in SplitToSize() and destroyed in CoalesceBuddies().
- Suballocation* buddy_;
-
- /// If this is in a free list, the next element in the list. nullptr if this is the last
- /// element in the free list. This pointer owns the next element in the linked list,
- /// which itself stores a raw back-pointer.
- std::unique_ptr<Suballocation> next_free_;
-
- /// If this is in a free list, the previous element in the list. nullptr if this is the
- /// first element. If non-nullptr, this Suballocation is owned by 'prev_free_'.
- Suballocation* prev_free_;
-
- /// True if was returned from Allocate() and hasn't been freed yet, or if it has been
- /// split into two child Suballocations.
- bool in_use_;
-};
-} // namespace doris
diff --git a/be/src/runtime/bufferpool/system_allocator.cc b/be/src/runtime/bufferpool/system_allocator.cc
deleted file mode 100644
index cc5f7b7a27..0000000000
--- a/be/src/runtime/bufferpool/system_allocator.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "runtime/bufferpool/system_allocator.h"
-
-#include <gperftools/malloc_extension.h>
-#include <sys/mman.h>
-
-#include "common/config.h"
-#include "gutil/strings/substitute.h"
-#include "runtime/thread_context.h"
-#include "util/bit_util.h"
-#include "util/error_util.h"
-
-// TODO: IMPALA-5073: this should eventually become the default once we are confident
-// that it is superior to allocating via TCMalloc.
-//DEFINE_bool(mmap_buffers, false,
-// "(Experimental) If true, allocate buffers directly from the operating system "
-// "instead of with TCMalloc.");
-
-//DEFINE_bool(madvise_huge_pages, true,
-// "(Advanced) If true, advise operating system to back large memory buffers with huge "
-// "pages");
-
-namespace doris {
-
-/// These are the page sizes on x86-64. We could parse /proc/meminfo to programmatically
-/// get this, but it is unlikely to change unless we port to a different architecture.
-static int64_t SMALL_PAGE_SIZE = 4LL * 1024;
-static int64_t HUGE_PAGE_SIZE = 2LL * 1024 * 1024;
-
-SystemAllocator::SystemAllocator(int64_t min_buffer_len) : min_buffer_len_(min_buffer_len) {
- DCHECK(BitUtil::IsPowerOf2(min_buffer_len));
-}
-
-Status SystemAllocator::Allocate(int64_t len, BufferPool::BufferHandle* buffer) {
- DCHECK_GE(len, min_buffer_len_);
- DCHECK_LE(len, BufferPool::MAX_BUFFER_BYTES);
- DCHECK(BitUtil::IsPowerOf2(len)) << len;
-
- uint8_t* buffer_mem;
- if (config::mmap_buffers) {
- RETURN_IF_ERROR(AllocateViaMMap(len, &buffer_mem));
- } else {
- RETURN_IF_ERROR(AllocateViaMalloc(len, &buffer_mem));
- }
- buffer->Open(buffer_mem, len, CpuInfo::get_current_core());
- return Status::OK();
-}
-
-Status SystemAllocator::AllocateViaMMap(int64_t len, uint8_t** buffer_mem) {
- int64_t map_len = len;
- bool use_huge_pages = len % HUGE_PAGE_SIZE == 0 && config::madvise_huge_pages;
- if (use_huge_pages) {
- // Map an extra huge page so we can fix up the alignment if needed.
- map_len += HUGE_PAGE_SIZE;
- }
- CONSUME_THREAD_MEM_TRACKER(map_len);
- uint8_t* mem = reinterpret_cast<uint8_t*>(
- mmap(nullptr, map_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0));
- if (mem == MAP_FAILED) {
- RELEASE_THREAD_MEM_TRACKER(map_len);
- return Status::BufferAllocFailed("mmap failed");
- }
-
- if (use_huge_pages) {
- // mmap() may return memory that is not aligned to the huge page size. For the
- // subsequent madvise() call to work well, we need to align it ourselves and
- // unmap the memory on either side of the buffer that we don't need.
- uintptr_t misalignment = reinterpret_cast<uintptr_t>(mem) % HUGE_PAGE_SIZE;
- if (misalignment != 0) {
- uintptr_t fixup = HUGE_PAGE_SIZE - misalignment;
- munmap(mem, fixup);
- RELEASE_THREAD_MEM_TRACKER(fixup);
- mem += fixup;
- map_len -= fixup;
- }
- munmap(mem + len, map_len - len);
- RELEASE_THREAD_MEM_TRACKER(map_len - len);
- DCHECK_EQ(reinterpret_cast<uintptr_t>(mem) % HUGE_PAGE_SIZE, 0) << mem;
- // Mark the buffer as a candidate for promotion to huge pages. The Linux Transparent
- // Huge Pages implementation will try to back the memory with a huge page if it is
- // enabled. MADV_HUGEPAGE was introduced in 2.6.38, so we similarly need to skip this
- // code if we are compiling against an older kernel.
-#ifdef MADV_HUGEPAGE
- int rc;
- // According to madvise() docs it may return EAGAIN to signal that we should retry.
- do {
- rc = madvise(mem, len, MADV_HUGEPAGE);
- } while (rc == -1 && errno == EAGAIN);
- DCHECK(rc == 0) << "madvise(MADV_HUGEPAGE) shouldn't fail" << errno;
-#endif
- }
- *buffer_mem = mem;
- return Status::OK();
-}
-
-Status SystemAllocator::AllocateViaMalloc(int64_t len, uint8_t** buffer_mem) {
- bool use_huge_pages = len % HUGE_PAGE_SIZE == 0 && config::madvise_huge_pages;
- // Allocate, aligned to the page size that we expect to back the memory range.
- // This ensures that it can be backed by a whole pages, rather than parts of pages.
- size_t alignment = use_huge_pages ? HUGE_PAGE_SIZE : SMALL_PAGE_SIZE;
- int rc = posix_memalign(reinterpret_cast<void**>(buffer_mem), alignment, len);
-#if !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER)
- // Workaround ASAN bug where posix_memalign returns 0 even when allocation fails.
- // It should instead return ENOMEM. See https://bugs.llvm.org/show_bug.cgi?id=32968.
- if (rc == 0 && *buffer_mem == nullptr && len != 0) rc = ENOMEM;
-#endif
- if (rc != 0) {
- return Status::InternalError("posix_memalign() failed to allocate buffer: {}",
- get_str_err_msg());
- }
- if (use_huge_pages) {
-#ifdef MADV_HUGEPAGE
- // According to madvise() docs it may return EAGAIN to signal that we should retry.
- do {
- rc = madvise(*buffer_mem, len, MADV_HUGEPAGE);
- } while (rc == -1 && errno == EAGAIN);
- DCHECK(rc == 0) << "madvise(MADV_HUGEPAGE) shouldn't fail" << errno;
-#endif
- }
- return Status::OK();
-}
-
-void SystemAllocator::Free(BufferPool::BufferHandle&& buffer) {
- if (config::mmap_buffers) {
- int rc = munmap(buffer.data(), buffer.len());
- RELEASE_THREAD_MEM_TRACKER(buffer.len());
- DCHECK_EQ(rc, 0) << "Unexpected munmap() error: " << errno;
- } else {
- bool use_huge_pages = buffer.len() % HUGE_PAGE_SIZE == 0 && config::madvise_huge_pages;
- if (use_huge_pages) {
- // Undo the madvise so that is isn't a candidate to be newly backed by huge pages.
- // We depend on TCMalloc's "aggressive decommit" mode decommitting the physical
- // huge pages with madvise(DONTNEED) when we call free(). Otherwise, this huge
- // page region may be divvied up and subsequently decommitted in smaller chunks,
- // which may not actually release the physical memory, causing Impala physical
- // memory usage to exceed the process limit.
-#ifdef MADV_NOHUGEPAGE
- // According to madvise() docs it may return EAGAIN to signal that we should retry.
- int rc;
- do {
- rc = madvise(buffer.data(), buffer.len(), MADV_NOHUGEPAGE);
- } while (rc == -1 && errno == EAGAIN);
- DCHECK(rc == 0) << "madvise(MADV_NOHUGEPAGE) shouldn't fail" << errno;
-#endif
- }
- free(buffer.data());
- }
- buffer.Reset(); // Avoid DCHECK in ~BufferHandle().
-}
-} // namespace doris
diff --git a/be/src/runtime/bufferpool/system_allocator.h b/be/src/runtime/bufferpool/system_allocator.h
deleted file mode 100644
index 83c3a4507b..0000000000
--- a/be/src/runtime/bufferpool/system_allocator.h
+++ /dev/null
@@ -1,49 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include "common/status.h"
-#include "runtime/bufferpool/buffer_pool.h"
-
-namespace doris {
-
-/// The underlying memory allocator for the buffer pool that allocates buffer memory from
-/// the operating system using mmap(). All buffers are allocated through the BufferPool's
-/// SystemAllocator. The allocator only handles allocating buffers that are power-of-two
-/// multiples of the minimum buffer length.
-class SystemAllocator {
-public:
- SystemAllocator(int64_t min_buffer_len);
-
- /// Allocate memory for a buffer of 'len' bytes. 'len' must be a power-of-two multiple
- /// of the minimum buffer length.
- Status Allocate(int64_t len, BufferPool::BufferHandle* buffer) WARN_UNUSED_RESULT;
-
- /// Free the memory for a previously-allocated buffer.
- void Free(BufferPool::BufferHandle&& buffer);
-
-private:
- /// Allocate 'len' bytes of memory for a buffer via mmap().
- Status AllocateViaMMap(int64_t len, uint8_t** buffer_mem);
-
- /// Allocate 'len' bytes of memory for a buffer via our malloc implementation.
- Status AllocateViaMalloc(int64_t len, uint8_t** buffer_mem);
-
- const int64_t min_buffer_len_;
-};
-} // namespace doris
diff --git a/be/src/runtime/disk_io_mgr.cc b/be/src/runtime/disk_io_mgr.cc
deleted file mode 100644
index 702ee127a2..0000000000
--- a/be/src/runtime/disk_io_mgr.cc
+++ /dev/null
@@ -1,1195 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-// This file is copied from
-// https://github.com/apache/impala/blob/branch-2.9.0/be/src/runtime/disk-io-mgr.cc
-// and modified by Doris
-
-#include "runtime/disk_io_mgr.h"
-
-#include <boost/algorithm/string.hpp>
-
-#include "runtime/disk_io_mgr_internal.h"
-#include "runtime/exec_env.h"
-#include "runtime/thread_context.h"
-
-using std::string;
-using std::stringstream;
-using std::vector;
-using std::list;
-using std::endl;
-
-using std::lock_guard;
-using std::unique_lock;
-using std::mutex;
-using std::thread;
-
-// Returns the ceil of value/divisor
-static int64_t bit_ceil(int64_t value, int64_t divisor) {
- return value / divisor + (value % divisor != 0);
-}
-
-// Returns ceil(log2(x)).
-// TODO: this could be faster if we use __builtin_clz. Fix this if this ever shows up
-// in a hot path.
-static int bit_log2(uint64_t x) {
- DCHECK_GT(x, 0);
- if (x == 1) {
- return 0;
- }
- // Compute result = ceil(log2(x))
- // = floor(log2(x - 1)) + 1, for x > 1
- // by finding the position of the most significant bit (1-indexed) of x - 1
- // (floor(log2(n)) = MSB(n) (0-indexed))
- --x;
- int result = 1;
- while (x >>= 1) {
- ++result;
- }
- return result;
-}
-
-namespace doris {
-
-// Rotational disks should have 1 thread per disk to minimize seeks. Non-rotational
-// don't have this penalty and benefit from multiple concurrent IO requests.
-static const int THREADS_PER_ROTATIONAL_DISK = 1;
-static const int THREADS_PER_FLASH_DISK = 8;
-
-// The IoMgr is able to run with a wide range of memory usage. If a query has memory
-// remaining less than this value, the IoMgr will stop all buffering regardless of the
-// current queue size.
-static const int LOW_MEMORY = 64 * 1024 * 1024;
-
-const int DiskIoMgr::DEFAULT_QUEUE_CAPACITY = 2;
-
-// namespace detail {
-// Indicates if file handle caching should be used
-// static inline bool is_file_handle_caching_enabled() {
-// return config::max_cached_file_handles > 0;
-// }
-// }
-
-// This method is used to clean up resources upon eviction of a cache file handle.
-// void DiskIoMgr::HdfsCachedFileHandle::release(DiskIoMgr::HdfsCachedFileHandle** h) {
-// VLOG_FILE << "Cached file handle evicted, hdfsCloseFile() fid=" << (*h)->_hdfs_file;
-// delete (*h);
-// }
-
-// DiskIoMgr::HdfsCachedFileHandle::HdfsCachedFileHandle(const hdfsFS& fs, const char* fname,
-// int64_t mtime)
-// : _fs(fs), _hdfs_file(hdfsOpenFile(fs, fname, O_RDONLY, 0, 0, 0)), _mtime(mtime) {
-// VLOG_FILE << "hdfsOpenFile() file=" << fname << " fid=" << _hdfs_file;
-// }
-
-// DiskIoMgr::HdfsCachedFileHandle::~HdfsCachedFileHandle() {
-// if (_hdfs_file != nullptr && _fs != nullptr) {
-// VLOG_FILE << "hdfsCloseFile() fid=" << _hdfs_file;
-// hdfsCloseFile(_fs, _hdfs_file);
-// }
-// _fs = nullptr;
-// _hdfs_file = nullptr;
-// }
-
-// This class provides a cache of RequestContext objects. RequestContexts are recycled.
-// This is good for locality as well as lock contention. The cache has the property that
-// regardless of how many clients get added/removed, the memory locations for
-// existing clients do not change (not the case with std::vector) minimizing the locks we
-// have to take across all readers.
-// All functions on this object are thread safe
-class DiskIoMgr::RequestContextCache {
-public:
- RequestContextCache(DiskIoMgr* io_mgr) : _io_mgr(io_mgr) {}
-
- // Returns a context to the cache. This object can now be reused.
- void return_context(RequestContext* reader) {
- DCHECK(reader->_state != RequestContext::Inactive);
- reader->_state = RequestContext::Inactive;
- lock_guard<mutex> l(_lock);
- _inactive_contexts.push_back(reader);
- }
-
- // Returns a new RequestContext object. Allocates a new object if necessary.
- RequestContext* get_new_context() {
- lock_guard<mutex> l(_lock);
- if (!_inactive_contexts.empty()) {
- RequestContext* reader = _inactive_contexts.front();
- _inactive_contexts.pop_front();
- return reader;
- } else {
- RequestContext* reader = new RequestContext(_io_mgr, _io_mgr->num_total_disks());
- _all_contexts.push_back(reader);
- return reader;
- }
- }
-
- // This object has the same lifetime as the disk IoMgr.
- ~RequestContextCache() {
- for (list<RequestContext*>::iterator it = _all_contexts.begin(); it != _all_contexts.end();
- ++it) {
- delete *it;
- }
- }
-
- // Validates that all readers are cleaned up and in the inactive state.
- bool validate_all_inactive() {
- lock_guard<mutex> l(_lock);
- for (list<RequestContext*>::iterator it = _all_contexts.begin(); it != _all_contexts.end();
- ++it) {
- if ((*it)->_state != RequestContext::Inactive) {
- return false;
- }
- }
- DCHECK_EQ(_all_contexts.size(), _inactive_contexts.size());
- return _all_contexts.size() == _inactive_contexts.size();
- }
-
- string debug_string();
-
-private:
- DiskIoMgr* _io_mgr;
-
- // lock to protect all members below
- mutex _lock;
-
- // List of all request contexts created. Used for debugging
- list<RequestContext*> _all_contexts;
-
- // List of inactive readers. These objects can be used for a new reader.
- list<RequestContext*> _inactive_contexts;
-};
-
-string DiskIoMgr::RequestContextCache::debug_string() {
- lock_guard<mutex> l(_lock);
- stringstream ss;
- for (list<RequestContext*>::iterator it = _all_contexts.begin(); it != _all_contexts.end();
- ++it) {
- unique_lock<mutex> lock((*it)->_lock);
- ss << (*it)->debug_string() << endl;
- }
- return ss.str();
-}
-
-string DiskIoMgr::debug_string() {
- stringstream ss;
- ss << "RequestContexts: " << endl << _request_context_cache->debug_string() << endl;
-
- ss << "Disks: " << endl;
- for (int i = 0; i < _disk_queues.size(); ++i) {
- unique_lock<mutex> lock(_disk_queues[i]->lock);
- ss << " " << (void*)_disk_queues[i] << ":";
- if (!_disk_queues[i]->request_contexts.empty()) {
- ss << " Readers: ";
- for (RequestContext* req_context : _disk_queues[i]->request_contexts) {
- ss << (void*)req_context;
- }
- }
- ss << endl;
- }
- return ss.str();
-}
-
-DiskIoMgr::BufferDescriptor::BufferDescriptor(DiskIoMgr* io_mgr)
- : _io_mgr(io_mgr), _reader(nullptr), _buffer(nullptr) {}
-
-void DiskIoMgr::BufferDescriptor::reset(RequestContext* reader, ScanRange* range, char* buffer,
- int64_t buffer_len) {
- DCHECK(_io_mgr != nullptr);
- DCHECK(_buffer == nullptr);
- DCHECK(range != nullptr);
- DCHECK(buffer != nullptr);
- DCHECK_GE(buffer_len, 0);
- _reader = reader;
- _scan_range = range;
- _buffer = buffer;
- _buffer_len = buffer_len;
- _len = 0;
- _eosr = false;
- _status = Status::OK();
-}
-
-void DiskIoMgr::BufferDescriptor::return_buffer() {
- DCHECK(_io_mgr != nullptr);
- _io_mgr->return_buffer(this);
-}
-
-DiskIoMgr::WriteRange::WriteRange(const string& file, int64_t file_offset, int disk_id,
- WriteDoneCallback callback) {
- _file = file;
- _offset = file_offset;
- _disk_id = disk_id;
- _callback = callback;
- _request_type = RequestType::WRITE;
-}
-
-void DiskIoMgr::WriteRange::set_data(const uint8_t* buffer, int64_t len) {
- _data = buffer;
- _len = len;
-}
-
-static void check_sse_support() {
- if (!CpuInfo::is_supported(CpuInfo::SSE4_2)) {
- LOG(WARNING) << "This machine does not support sse4_2. The default IO system "
- "configurations are suboptimal for this hardware. Consider "
- "increasing the number of threads per disk by restarting doris "
- "using the --num_threads_per_disk flag with a higher value";
- }
-}
-
-DiskIoMgr::DiskIoMgr()
- : _num_threads_per_disk(config::num_threads_per_disk),
- _max_buffer_size(config::read_size),
- _min_buffer_size(config::min_buffer_size),
- _cached_read_options(nullptr),
- _shut_down(false),
- _total_bytes_read_counter(TUnit::BYTES),
- _read_timer(TUnit::TIME_NS)
-// _read_timer(TUnit::TIME_NS),
-// _file_handle_cache(
-// std::min((uint64_t)config::max_cached_file_handles, FileSystemUtil::max_num_file_handles()),
-// &HdfsCachedFileHandle::release) {
-{
- int64_t max_buffer_size_scaled = bit_ceil(_max_buffer_size, _min_buffer_size);
- _free_buffers.resize(bit_log2(max_buffer_size_scaled) + 1);
- int num_local_disks = (config::num_disks == 0 ? DiskInfo::num_disks() : config::num_disks);
- _disk_queues.resize(num_local_disks + REMOTE_NUM_DISKS);
- check_sse_support();
-}
-
-DiskIoMgr::DiskIoMgr(int num_local_disks, int threads_per_disk, int min_buffer_size,
- int max_buffer_size)
- : _num_threads_per_disk(threads_per_disk),
- _max_buffer_size(max_buffer_size),
- _min_buffer_size(min_buffer_size),
- _cached_read_options(nullptr),
- _shut_down(false),
- _total_bytes_read_counter(TUnit::BYTES),
- _read_timer(TUnit::TIME_NS)
-// _read_timer(TUnit::TIME_NS),
-// _file_handle_cache(::min(config::max_cached_file_handles,
-// FileSystemUtil::max_num_file_handles()), &HdfsCachedFileHandle::release) {
-{
- int64_t max_buffer_size_scaled = bit_ceil(_max_buffer_size, _min_buffer_size);
- _free_buffers.resize(bit_log2(max_buffer_size_scaled) + 1);
- if (num_local_disks == 0) {
- num_local_disks = DiskInfo::num_disks();
- }
- _disk_queues.resize(num_local_disks + REMOTE_NUM_DISKS);
- check_sse_support();
-}
-
-DiskIoMgr::~DiskIoMgr() {
- _shut_down = true;
- // Notify all worker threads and shut them down.
- for (int i = 0; i < _disk_queues.size(); ++i) {
- if (_disk_queues[i] == nullptr) {
- continue;
- }
- {
- // This lock is necessary to properly use the condition var to notify
- // the disk worker threads. The readers also grab this lock so updates
- // to _shut_down are protected.
- unique_lock<mutex> disk_lock(_disk_queues[i]->lock);
- }
- _disk_queues[i]->work_available.notify_all();
- }
- _disk_thread_group.join_all();
-
- for (int i = 0; i < _disk_queues.size(); ++i) {
- if (_disk_queues[i] == nullptr) {
- continue;
- }
- int disk_id = _disk_queues[i]->disk_id;
- for (list<RequestContext*>::iterator it = _disk_queues[i]->request_contexts.begin();
- it != _disk_queues[i]->request_contexts.end(); ++it) {
- DCHECK_EQ((*it)->_disk_states[disk_id].num_threads_in_op(), 0);
- DCHECK((*it)->_disk_states[disk_id].done());
- (*it)->decrement_disk_ref_count();
- }
- }
-
- DCHECK(_request_context_cache.get() == nullptr ||
- _request_context_cache->validate_all_inactive())
- << endl
- << debug_string();
- DCHECK_EQ(_num_buffers_in_readers, 0);
-
- // Delete all allocated buffers
- int num_free_buffers = 0;
- for (int idx = 0; idx < _free_buffers.size(); ++idx) {
- num_free_buffers += _free_buffers[idx].size();
- }
- DCHECK_EQ(_num_allocated_buffers, num_free_buffers);
- gc_io_buffers();
-
- for (int i = 0; i < _disk_queues.size(); ++i) {
- delete _disk_queues[i];
- }
-
- /*
- * if (_cached_read_options != nullptr) {
- * hadoopRzOptionsFree(_cached_read_options);
- * }
- */
-}
-
-Status DiskIoMgr::init(const int64_t mem_limit) {
- _mem_tracker = std::make_unique<MemTrackerLimiter>(MemTrackerLimiter::Type::GLOBAL, "DiskIO",
- mem_limit);
-
- for (int i = 0; i < _disk_queues.size(); ++i) {
- _disk_queues[i] = new DiskQueue(i);
- int num_threads_per_disk = 0;
- if (i >= num_local_disks()) {
- // remote disks, do nothing
- continue;
- } else if (_num_threads_per_disk != 0) {
- num_threads_per_disk = _num_threads_per_disk;
- } else if (DiskInfo::is_rotational(i)) {
- num_threads_per_disk = THREADS_PER_ROTATIONAL_DISK;
- } else {
- num_threads_per_disk = THREADS_PER_FLASH_DISK;
- }
- for (int j = 0; j < num_threads_per_disk; ++j) {
- stringstream ss;
- ss << "work-loop(Disk: " << i << ", Thread: " << j << ")";
- // _disk_thread_group.AddThread(new Thread("disk-io-mgr", ss.str(),
- // &DiskIoMgr::work_loop, this, _disk_queues[i]));
- _disk_thread_group.add_thread(
- new std::thread(std::bind(&DiskIoMgr::work_loop, this, _disk_queues[i])));
- }
- }
- _request_context_cache.reset(new RequestContextCache(this));
-
- // _cached_read_options = hadoopRzOptionsAlloc();
- // DCHECK(_cached_read_options != nullptr);
- // Disable checksum for cached reads.
- // int ret = hadoopRzOptionsSetSkipChecksum(_cached_read_options, true);
- // DCHECK_EQ(ret, 0);
- // Disable automatic fallback for cached reads.
- // ret = hadoopRzOptionsSetByteBufferPool(_cached_read_options, nullptr);
- // DCHECK_EQ(ret, 0);
-
- return Status::OK();
-}
-
-Status DiskIoMgr::register_context(RequestContext** request_context) {
- DCHECK(_request_context_cache) << "Must call init() first.";
- *request_context = _request_context_cache->get_new_context();
- (*request_context)->reset();
- return Status::OK();
-}
-
-void DiskIoMgr::unregister_context(RequestContext* reader) {
- // Blocking cancel (waiting for disks completion).
- cancel_context(reader, true);
-
- // All the disks are done with clean, validate nothing is leaking.
- unique_lock<mutex> reader_lock(reader->_lock);
- DCHECK_EQ(reader->_num_buffers_in_reader, 0) << endl << reader->debug_string();
- DCHECK_EQ(reader->_num_used_buffers, 0) << endl << reader->debug_string();
-
- DCHECK(reader->validate()) << endl << reader->debug_string();
- _request_context_cache->return_context(reader);
-}
-
-// Cancellation requires coordination from multiple threads. Each thread that currently
-// has a reference to the request context must notice the cancel and remove it from its
-// tracking structures. The last thread to touch the context should deallocate (aka
-// recycle) the request context object. Potential threads are:
-// 1. Disk threads that are currently reading for this reader.
-// 2. Caller threads that are waiting in get_next.
-//
-// The steps are:
-// 1. Cancel will immediately set the context in the Cancelled state. This prevents any
-// other thread from adding more ready buffers to the context (they all take a lock and
-// check the state before doing so), or any write ranges to the context.
-// 2. Cancel will call cancel on each ScanRange that is not yet complete, unblocking
-// any threads in get_next(). The reader will see the cancelled Status returned. Cancel
-// also invokes the callback for the WriteRanges with the cancelled state.
-// 3. Disk threads notice the context is cancelled either when picking the next context
-// to process or when they try to enqueue a ready buffer. Upon noticing the cancelled
-// state, removes the context from the disk queue. The last thread per disk with an
-// outstanding reference to the context decrements the number of disk queues the context
-// is on.
-// If wait_for_disks_completion is true, wait for the number of active disks to become 0.
-void DiskIoMgr::cancel_context(RequestContext* context, bool wait_for_disks_completion) {
- context->cancel(Status::Cancelled("Cancelled"));
-
- if (wait_for_disks_completion) {
- unique_lock<mutex> lock(context->_lock);
- DCHECK(context->validate()) << endl << context->debug_string();
- while (context->_num_disks_with_ranges > 0) {
- context->_disks_complete_cond_var.wait(lock);
- }
- }
-}
-
-void DiskIoMgr::set_read_timer(RequestContext* r, RuntimeProfile::Counter* c) {
- r->_read_timer = c;
-}
-
-void DiskIoMgr::set_bytes_read_counter(RequestContext* r, RuntimeProfile::Counter* c) {
- r->_bytes_read_counter = c;
-}
-
-void DiskIoMgr::set_active_read_thread_counter(RequestContext* r, RuntimeProfile::Counter* c) {
- r->_active_read_thread_counter = c;
-}
-
-void DiskIoMgr::set_disks_access_bitmap(RequestContext* r, RuntimeProfile::Counter* c) {
- r->_disks_accessed_bitmap = c;
-}
-
-int64_t DiskIoMgr::queue_size(RequestContext* reader) const {
- return reader->_num_ready_buffers;
-}
-
-Status DiskIoMgr::context_status(RequestContext* context) const {
- unique_lock<mutex> lock(context->_lock);
- return context->_status;
-}
-
-int DiskIoMgr::num_unstarted_ranges(RequestContext* reader) const {
- return reader->_num_unstarted_scan_ranges;
-}
-
-int64_t DiskIoMgr::bytes_read_local(RequestContext* reader) const {
- return reader->_bytes_read_local;
-}
-
-int64_t DiskIoMgr::bytes_read_short_circuit(RequestContext* reader) const {
- return reader->_bytes_read_short_circuit;
-}
-
-int64_t DiskIoMgr::bytes_read_dn_cache(RequestContext* reader) const {
- return reader->_bytes_read_dn_cache;
-}
-
-int DiskIoMgr::num_remote_ranges(RequestContext* reader) const {
- return reader->_num_remote_ranges;
-}
-
-int64_t DiskIoMgr::unexpected_remote_bytes(RequestContext* reader) const {
- return reader->_unexpected_remote_bytes;
-}
-
-int64_t DiskIoMgr::get_read_throughput() {
- return RuntimeProfile::units_per_second(&_total_bytes_read_counter, &_read_timer);
-}
-
-Status DiskIoMgr::validate_scan_range(ScanRange* range) {
- int disk_id = range->_disk_id;
- if (disk_id < 0 || disk_id >= _disk_queues.size()) {
- stringstream ss;
- ss << "Invalid scan range. Bad disk id: " << disk_id;
- DCHECK(false) << ss.str();
- return Status::InternalError(ss.str());
- }
- return Status::OK();
-}
-
-Status DiskIoMgr::add_scan_ranges(RequestContext* reader, const vector<ScanRange*>& ranges,
- bool schedule_immediately) {
- if (ranges.empty()) {
- return Status::OK();
- }
-
- // Validate and initialize all ranges
- for (int i = 0; i < ranges.size(); ++i) {
- RETURN_IF_ERROR(validate_scan_range(ranges[i]));
- ranges[i]->init_internal(this, reader);
- }
-
- // disks that this reader needs to be scheduled on.
- unique_lock<mutex> reader_lock(reader->_lock);
- DCHECK(reader->validate()) << endl << reader->debug_string();
-
- if (reader->_state == RequestContext::Cancelled) {
- DCHECK(!reader->_status.ok());
- return reader->_status;
- }
-
- // Add each range to the queue of the disk the range is on
- for (int i = 0; i < ranges.size(); ++i) {
- // Don't add empty ranges.
- DCHECK_NE(ranges[i]->len(), 0);
- ScanRange* range = ranges[i];
-
- /*
- * if (range->_try_cache) {
- * if (schedule_immediately) {
- * bool cached_read_succeeded;
- * RETURN_IF_ERROR(range->read_from_cache(&cached_read_succeeded));
- * if (cached_read_succeeded) continue;
- * // Cached read failed, fall back to add_request_range() below.
- * } else {
- * reader->_cached_ranges.enqueue(range);
- * continue;
- * }
- * }
- */
- reader->add_request_range(range, schedule_immediately);
- }
- DCHECK(reader->validate()) << endl << reader->debug_string();
-
- return Status::OK();
-}
-
-// This function returns the next scan range the reader should work on, checking
-// for eos and error cases. If there isn't already a cached scan range or a scan
-// range prepared by the disk threads, the caller waits on the disk threads.
-Status DiskIoMgr::get_next_range(RequestContext* reader, ScanRange** range) {
- DCHECK(reader != nullptr);
- DCHECK(range != nullptr);
- *range = nullptr;
- Status status = Status::OK();
-
- unique_lock<mutex> reader_lock(reader->_lock);
- DCHECK(reader->validate()) << endl << reader->debug_string();
-
- while (true) {
- if (reader->_state == RequestContext::Cancelled) {
- DCHECK(!reader->_status.ok());
- status = reader->_status;
- break;
- }
-
- if (reader->_num_unstarted_scan_ranges == 0 && reader->_ready_to_start_ranges.empty() &&
- reader->_cached_ranges.empty()) {
- // All ranges are done, just return.
- break;
- }
-
- // if (!reader->_cached_ranges.empty()) {
- // // We have a cached range.
- // *range = reader->_cached_ranges.dequeue();
- // DCHECK((*range)->_try_cache);
- // // bool cached_read_succeeded;
- // // RETURN_IF_ERROR((*range)->read_from_cache(&cached_read_succeeded));
- // // if (cached_read_succeeded) return Status::OK();
-
- // // This range ended up not being cached. Loop again and pick up a new range.
- // reader->add_request_range(*range, false);
- // DCHECK(reader->validate()) << endl << reader->debug_string();
- // *range = nullptr;
- // continue;
- // }
-
- if (reader->_ready_to_start_ranges.empty()) {
- reader->_ready_to_start_ranges_cv.wait(reader_lock);
- } else {
- *range = reader->_ready_to_start_ranges.dequeue();
- DCHECK(*range != nullptr);
- int disk_id = (*range)->disk_id();
- DCHECK_EQ(*range, reader->_disk_states[disk_id].next_scan_range_to_start());
- // Set this to nullptr, the next time this disk runs for this reader, it will
- // get another range ready.
- reader->_disk_states[disk_id].set_next_scan_range_to_start(nullptr);
- reader->schedule_scan_range(*range);
- break;
- }
- }
- return status;
-}
-
-Status DiskIoMgr::read(RequestContext* reader, ScanRange* range, BufferDescriptor** buffer) {
- DCHECK(range != nullptr);
- DCHECK(buffer != nullptr);
- *buffer = nullptr;
-
- if (range->len() > _max_buffer_size) {
- return Status::InternalError("Cannot perform sync read larger than {}. Request was {}",
- _max_buffer_size, range->len());
- }
-
- vector<DiskIoMgr::ScanRange*> ranges;
- ranges.push_back(range);
- RETURN_IF_ERROR(add_scan_ranges(reader, ranges, true));
- RETURN_IF_ERROR(range->get_next(buffer));
- DCHECK((*buffer) != nullptr);
- DCHECK((*buffer)->eosr());
- return Status::OK();
-}
-
-void DiskIoMgr::return_buffer(BufferDescriptor* buffer_desc) {
- DCHECK(buffer_desc != nullptr);
- if (!buffer_desc->_status.ok()) {
- DCHECK(buffer_desc->_buffer == nullptr);
- }
-
- RequestContext* reader = buffer_desc->_reader;
- if (buffer_desc->_buffer != nullptr) {
- if (buffer_desc->_scan_range->_cached_buffer == nullptr) {
- // Not a cached buffer. Return the io buffer and update mem tracking.
- return_free_buffer(buffer_desc);
- }
- buffer_desc->_buffer = nullptr;
- --_num_buffers_in_readers;
- --reader->_num_buffers_in_reader;
- } else {
- // A nullptr buffer means there was an error in which case there is no buffer
- // to return.
- }
-
- if (buffer_desc->_eosr || buffer_desc->_scan_range->_is_cancelled) {
- // Need to close the scan range if returning the last buffer or the scan range
- // has been cancelled (and the caller might never get the last buffer).
- // close() is idempotent so multiple cancelled buffers is okay.
- buffer_desc->_scan_range->close();
- }
- return_buffer_desc(buffer_desc);
-}
-
-void DiskIoMgr::return_buffer_desc(BufferDescriptor* desc) {
- DCHECK(desc != nullptr);
- unique_lock<mutex> lock(_free_buffers_lock);
- DCHECK(find(_free_buffer_descs.begin(), _free_buffer_descs.end(), desc) ==
- _free_buffer_descs.end());
- _free_buffer_descs.push_back(desc);
-}
-
-DiskIoMgr::BufferDescriptor* DiskIoMgr::get_buffer_desc(RequestContext* reader, ScanRange* range,
- char* buffer, int64_t buffer_size) {
- BufferDescriptor* buffer_desc = nullptr;
- {
- unique_lock<mutex> lock(_free_buffers_lock);
- if (_free_buffer_descs.empty()) {
- buffer_desc = _pool.add(new BufferDescriptor(this));
- } else {
- buffer_desc = _free_buffer_descs.front();
- _free_buffer_descs.pop_front();
- }
- }
- buffer_desc->reset(reader, range, buffer, buffer_size);
- return buffer_desc;
-}
-
-char* DiskIoMgr::get_free_buffer(int64_t* buffer_size) {
- DCHECK_LE(*buffer_size, _max_buffer_size);
- DCHECK_GT(*buffer_size, 0);
- *buffer_size = std::min(static_cast<int64_t>(_max_buffer_size), *buffer_size);
- int idx = free_buffers_idx(*buffer_size);
- // Quantize buffer size to nearest power of 2 greater than the specified buffer size and
- // convert to bytes
- *buffer_size = (1 << idx) * _min_buffer_size;
-
- unique_lock<mutex> lock(_free_buffers_lock);
- char* buffer = nullptr;
- if (_free_buffers[idx].empty()) {
- ++_num_allocated_buffers;
- buffer = new char[*buffer_size];
- } else {
- // This means the buffer's memory ownership is transferred from DiskIoMgr to tls tracker.
- THREAD_MEM_TRACKER_TRANSFER_FROM(*buffer_size, _mem_tracker.get());
- buffer = _free_buffers[idx].front();
- _free_buffers[idx].pop_front();
- }
- DCHECK(buffer != nullptr);
- return buffer;
-}
-
-void DiskIoMgr::gc_io_buffers(int64_t bytes_to_free) {
- unique_lock<mutex> lock(_free_buffers_lock);
- int bytes_freed = 0;
- for (int idx = 0; idx < _free_buffers.size(); ++idx) {
- for (list<char*>::iterator iter = _free_buffers[idx].begin();
- iter != _free_buffers[idx].end(); ++iter) {
- int64_t buffer_size = (1 << idx) * _min_buffer_size;
- --_num_allocated_buffers;
- delete[] * iter;
-
- bytes_freed += buffer_size;
- }
- _free_buffers[idx].clear();
- if (bytes_freed >= bytes_to_free) {
- break;
- }
- }
- // The deleted buffer is released in the tls mem tracker, the deleted buffer belongs to DiskIoMgr,
- // so the freed memory should be recorded in the DiskIoMgr mem tracker. So if the tls mem tracker
- // and the DiskIoMgr tracker are different, transfer memory ownership.
- THREAD_MEM_TRACKER_TRANSFER_FROM(bytes_freed, _mem_tracker.get());
-}
-
-void DiskIoMgr::return_free_buffer(BufferDescriptor* desc) {
- return_free_buffer(desc->_buffer, desc->_buffer_len);
-}
-
-void DiskIoMgr::return_free_buffer(char* buffer, int64_t buffer_size) {
- DCHECK(buffer != nullptr);
- int idx = free_buffers_idx(buffer_size);
- DCHECK_EQ(bit_ceil(buffer_size, _min_buffer_size) & ~(1 << idx), 0)
- << "_buffer_size / _min_buffer_size should be power of 2, got buffer_size = "
- << buffer_size << ", _min_buffer_size = " << _min_buffer_size;
- unique_lock<mutex> lock(_free_buffers_lock);
- if (!config::disable_mem_pools && _free_buffers[idx].size() < config::max_free_io_buffers) {
- // The buffer's memory ownership is transferred from desc->buffer_mem_tracker to DiskIoMgr tracker.
- THREAD_MEM_TRACKER_TRANSFER_TO(buffer_size, _mem_tracker.get());
- _free_buffers[idx].push_back(buffer);
- } else {
- --_num_allocated_buffers;
- delete[] buffer;
- }
-}
-
-// This function gets the next RequestRange to work on for this disk. It checks for
-// cancellation and
-// a) Updates ready_to_start_ranges if there are no scan ranges queued for this disk.
-// b) Adds an unstarted write range to _in_flight_ranges. The write range is processed
-// immediately if there are no preceding scan ranges in _in_flight_ranges
-// It blocks until work is available or the thread is shut down.
-// Work is available if there is a RequestContext with
-// - A ScanRange with a buffer available, or
-// - A WriteRange in _unstarted_write_ranges.
-bool DiskIoMgr::get_next_request_range(DiskQueue* disk_queue, RequestRange** range,
- RequestContext** request_context) {
- int disk_id = disk_queue->disk_id;
- *range = nullptr;
-
- // This loops returns either with work to do or when the disk IoMgr shuts down.
- while (!_shut_down) {
- *request_context = nullptr;
- RequestContext::PerDiskState* request_disk_state = nullptr;
- {
- unique_lock<mutex> disk_lock(disk_queue->lock);
-
- while (!_shut_down && disk_queue->request_contexts.empty()) {
- // wait if there are no readers on the queue
- disk_queue->work_available.wait(disk_lock);
- }
- if (_shut_down) {
- break;
- }
- DCHECK(!disk_queue->request_contexts.empty());
-
- // Get the next reader and remove the reader so that another disk thread
- // can't pick it up. It will be enqueued before issuing the read to HDFS
- // so this is not a big deal (i.e. multiple disk threads can read for the
- // same reader).
- // TODO: revisit.
- *request_context = disk_queue->request_contexts.front();
- disk_queue->request_contexts.pop_front();
- DCHECK(*request_context != nullptr);
- request_disk_state = &((*request_context)->_disk_states[disk_id]);
- request_disk_state->increment_request_thread_and_dequeue();
- }
-
- // NOTE: no locks were taken in between. We need to be careful about what state
- // could have changed to the reader and disk in between.
- // There are some invariants here. Only one disk thread can have the
- // same reader here (the reader is removed from the queue). There can be
- // other disk threads operating on this reader in other functions though.
-
- unique_lock<mutex> request_lock((*request_context)->_lock);
- VLOG_FILE << "Disk (id=" << disk_id << ") reading for "
- << (*request_context)->debug_string();
-
- // Check if reader has been cancelled
- if ((*request_context)->_state == RequestContext::Cancelled) {
- request_disk_state->decrement_request_thread_and_check_done(*request_context);
- continue;
- }
-
- DCHECK_EQ((*request_context)->_state, RequestContext::Active)
- << (*request_context)->debug_string();
-
- if (request_disk_state->next_scan_range_to_start() == nullptr &&
- !request_disk_state->unstarted_scan_ranges()->empty()) {
- // We don't have a range queued for this disk for what the caller should
- // read next. Populate that. We want to have one range waiting to minimize
- // wait time in get_next_range.
- ScanRange* new_range = request_disk_state->unstarted_scan_ranges()->dequeue();
- --(*request_context)->_num_unstarted_scan_ranges;
- (*request_context)->_ready_to_start_ranges.enqueue(new_range);
- request_disk_state->set_next_scan_range_to_start(new_range);
-
- if ((*request_context)->_num_unstarted_scan_ranges == 0) {
- // All the ranges have been started, notify everyone blocked on get_next_range.
- // Only one of them will get work so make sure to return nullptr to the other
- // caller threads.
- (*request_context)->_ready_to_start_ranges_cv.notify_all();
- } else {
- (*request_context)->_ready_to_start_ranges_cv.notify_one();
- }
- }
-
- // Always enqueue a WriteRange to be processed into _in_flight_ranges.
- // This is done so _in_flight_ranges does not exclusively contain ScanRanges.
- // For now, enqueuing a WriteRange on each invocation of get_next_request_range()
- // does not flood in_flight_ranges() with WriteRanges because the entire
- // WriteRange is processed and removed from the queue after get_next_request_range()
- // returns. (A DCHECK is used to ensure that writes do not exceed 8MB).
- if (!request_disk_state->unstarted_write_ranges()->empty()) {
- WriteRange* write_range = request_disk_state->unstarted_write_ranges()->dequeue();
- request_disk_state->in_flight_ranges()->enqueue(write_range);
- }
-
- // Get the next scan range to work on from the reader. Only in_flight_ranges
- // are eligible since the disk threads do not start new ranges on their own.
-
- // There are no inflight ranges, nothing to do.
- if (request_disk_state->in_flight_ranges()->empty()) {
- request_disk_state->decrement_request_thread();
- continue;
- }
- DCHECK_GT(request_disk_state->num_remaining_ranges(), 0);
- *range = request_disk_state->in_flight_ranges()->dequeue();
- DCHECK(*range != nullptr);
-
- // Now that we've picked a request range, put the context back on the queue so
- // another thread can pick up another request range for this context.
- request_disk_state->schedule_context(*request_context, disk_id);
- DCHECK((*request_context)->validate()) << endl << (*request_context)->debug_string();
- return true;
- }
-
- DCHECK(_shut_down);
- return false;
-}
-
-void DiskIoMgr::handle_write_finished(RequestContext* writer, WriteRange* write_range,
- const Status& write_status) {
- // Execute the callback before decrementing the thread count. Otherwise cancel_context()
- // that waits for the disk ref count to be 0 will return, creating a race, e.g.
- // between BufferedBlockMgr::WriteComplete() and BufferedBlockMgr::~BufferedBlockMgr().
- // See IMPALA-1890.
- // The status of the write does not affect the status of the writer context.
- write_range->_callback(write_status);
- {
- unique_lock<mutex> writer_lock(writer->_lock);
- DCHECK(writer->validate()) << endl << writer->debug_string();
- RequestContext::PerDiskState& state = writer->_disk_states[write_range->_disk_id];
- if (writer->_state == RequestContext::Cancelled) {
- state.decrement_request_thread_and_check_done(writer);
- } else {
- state.decrement_request_thread();
- }
- --state.num_remaining_ranges();
- }
-}
-
-void DiskIoMgr::handle_read_finished(DiskQueue* disk_queue, RequestContext* reader,
- BufferDescriptor* buffer) {
- unique_lock<mutex> reader_lock(reader->_lock);
-
- RequestContext::PerDiskState& state = reader->_disk_states[disk_queue->disk_id];
- DCHECK(reader->validate()) << endl << reader->debug_string();
- DCHECK_GT(state.num_threads_in_op(), 0);
- DCHECK(buffer->_buffer != nullptr);
-
- if (reader->_state == RequestContext::Cancelled) {
- state.decrement_request_thread_and_check_done(reader);
- DCHECK(reader->validate()) << endl << reader->debug_string();
- return_free_buffer(buffer);
- buffer->_buffer = nullptr;
- buffer->_scan_range->cancel(reader->_status);
- // Enqueue the buffer to use the scan range's buffer cleanup path.
- buffer->_scan_range->enqueue_buffer(buffer);
- return;
- }
-
- DCHECK_EQ(reader->_state, RequestContext::Active);
- DCHECK(buffer->_buffer != nullptr);
-
- // Update the reader's scan ranges. There are a three cases here:
- // 1. Read error
- // 2. End of scan range
- // 3. Middle of scan range
- if (!buffer->_status.ok()) {
- // Error case
- return_free_buffer(buffer);
- buffer->_eosr = true;
- --state.num_remaining_ranges();
- buffer->_scan_range->cancel(buffer->_status);
- } else if (buffer->_eosr) {
- --state.num_remaining_ranges();
- }
-
- // After calling enqueue_buffer(), it is no longer valid to read from buffer.
- // Store the state we need before calling enqueue_buffer().
- bool eosr = buffer->_eosr;
- ScanRange* scan_range = buffer->_scan_range;
- bool queue_full = buffer->_scan_range->enqueue_buffer(buffer);
- if (eosr) {
- // For cached buffers, we can't close the range until the cached buffer is returned.
- // close() is called from DiskIoMgr::return_buffer().
- /*
- * if (scan_range->_cached_buffer == nullptr) {
- * scan_range->close();
- * }
- */
- } else {
- if (queue_full) {
- reader->_blocked_ranges.enqueue(scan_range);
- } else {
- reader->schedule_scan_range(scan_range);
- }
- }
- state.decrement_request_thread();
-}
-
-void DiskIoMgr::work_loop(DiskQueue* disk_queue) {
- // The thread waits until there is work or the entire system is being shut down.
- // If there is work, performs the read or write requested and re-enqueues the
- // requesting context.
- // Locks are not taken when reading from or writing to disk.
- // The main loop has three parts:
- // 1. GetNextRequestContext(): get the next request context (read or write) to
- // process and dequeue it.
- // 2. For the dequeued request, gets the next scan- or write-range to process and
- // re-enqueues the request.
- // 3. Perform the read or write as specified.
- // Cancellation checking needs to happen in both steps 1 and 3.
-
- while (!_shut_down) {
- RequestContext* worker_context = nullptr;
- ;
- RequestRange* range = nullptr;
-
- if (!get_next_request_range(disk_queue, &range, &worker_context)) {
- DCHECK(_shut_down);
- break;
- }
-
- if (range->request_type() == RequestType::READ) {
- read_range(disk_queue, worker_context, static_cast<ScanRange*>(range));
- } else {
- DCHECK(range->request_type() == RequestType::WRITE);
- write(worker_context, static_cast<WriteRange*>(range));
- }
- }
-
- DCHECK(_shut_down);
-}
-
-// This function reads the specified scan range associated with the
-// specified reader context and disk queue.
-void DiskIoMgr::read_range(DiskQueue* disk_queue, RequestContext* reader, ScanRange* range) {
- char* buffer = nullptr;
- int64_t bytes_remaining = range->_len - range->_bytes_read;
- DCHECK_GT(bytes_remaining, 0);
- int64_t buffer_size = std::min(bytes_remaining, static_cast<int64_t>(_max_buffer_size));
- bool enough_memory = _mem_tracker->spare_capacity() > LOW_MEMORY;
- if (!enough_memory) {
- // Low memory, GC and try again.
- gc_io_buffers();
- enough_memory = _mem_tracker->spare_capacity() > LOW_MEMORY;
- }
-
- if (!enough_memory) {
- RequestContext::PerDiskState& state = reader->_disk_states[disk_queue->disk_id];
- unique_lock<mutex> reader_lock(reader->_lock);
-
- // Just grabbed the reader lock, check for cancellation.
- if (reader->_state == RequestContext::Cancelled) {
- DCHECK(reader->validate()) << endl << reader->debug_string();
- state.decrement_request_thread_and_check_done(reader);
- range->cancel(reader->_status);
- DCHECK(reader->validate()) << endl << reader->debug_string();
- return;
- }
-
- if (!range->_ready_buffers.empty()) {
- // We have memory pressure and this range doesn't need another buffer
- // (it already has one queued). Skip this range and pick it up later.
- range->_blocked_on_queue = true;
- reader->_blocked_ranges.enqueue(range);
- state.decrement_request_thread();
- return;
- } else {
- // We need to get a buffer anyway since there are none queued. The query
- // is likely to fail due to mem limits but there's nothing we can do about that
- // now.
- }
- }
-
- buffer = get_free_buffer(&buffer_size);
- ++reader->_num_used_buffers;
-
- // Validate more invariants.
- DCHECK_GT(reader->_num_used_buffers, 0);
- DCHECK(range != nullptr);
- DCHECK(reader != nullptr);
- DCHECK(buffer != nullptr);
-
- BufferDescriptor* buffer_desc = get_buffer_desc(reader, range, buffer, buffer_size);
- DCHECK(buffer_desc != nullptr);
-
- // No locks in this section. Only working on local vars. We don't want to hold a
- // lock across the read call.
- buffer_desc->_status = range->open();
- if (buffer_desc->_status.ok()) {
- // Update counters.
- if (reader->_active_read_thread_counter) {
- reader->_active_read_thread_counter->update(1L);
- }
- if (reader->_disks_accessed_bitmap) {
- int64_t disk_bit = 1 << disk_queue->disk_id;
- reader->_disks_accessed_bitmap->bit_or(disk_bit);
- }
- SCOPED_TIMER(&_read_timer);
- SCOPED_TIMER(reader->_read_timer);
-
- buffer_desc->_status = range->read(buffer, &buffer_desc->_len, &buffer_desc->_eosr);
- buffer_desc->_scan_range_offset = range->_bytes_read - buffer_desc->_len;
-
- if (reader->_bytes_read_counter != nullptr) {
- COUNTER_UPDATE(reader->_bytes_read_counter, buffer_desc->_len);
- }
-
- COUNTER_UPDATE(&_total_bytes_read_counter, buffer_desc->_len);
- if (reader->_active_read_thread_counter) {
- reader->_active_read_thread_counter->update(-1L);
- }
- }
-
- // Finished read, update reader/disk based on the results
- handle_read_finished(disk_queue, reader, buffer_desc);
-}
-
-void DiskIoMgr::write(RequestContext* writer_context, WriteRange* write_range) {
- FILE* file_handle = fopen(write_range->file(), "rb+");
- Status ret_status;
- if (file_handle == nullptr) {
- stringstream error_msg;
- error_msg << "fopen(" << write_range->_file << ", \"rb+\") failed with errno=" << errno
- << " description=" << get_str_err_msg();
- ret_status = Status::InternalError(error_msg.str());
- } else {
- ret_status = write_range_helper(file_handle, write_range);
-
- int success = fclose(file_handle);
- if (ret_status.ok() && success != 0) {
- ret_status = Status::InternalError("fclose({}) failed", write_range->_file);
- }
- }
-
- handle_write_finished(writer_context, write_range, ret_status);
-}
-
-Status DiskIoMgr::write_range_helper(FILE* file_handle, WriteRange* write_range) {
- // Seek to the correct offset and perform the write.
- int success = fseek(file_handle, write_range->offset(), SEEK_SET);
- if (success != 0) {
- return Status::InternalError("fseek({}, {} SEEK_SET) failed with errno={} description={}",
- write_range->_file, write_range->offset(), errno,
- get_str_err_msg());
- }
-
- int64_t bytes_written = fwrite(write_range->_data, 1, write_range->_len, file_handle);
- if (bytes_written < write_range->_len) {
- return Status::InternalError(
- "fwrite(buffer, 1, {}, {}) failed with errno={} description={}", write_range->_len,
- write_range->_file, errno, get_str_err_msg());
- }
-
- return Status::OK();
-}
-
-int DiskIoMgr::free_buffers_idx(int64_t buffer_size) {
- int64_t buffer_size_scaled = bit_ceil(buffer_size, _min_buffer_size);
- int idx = bit_log2(buffer_size_scaled);
- DCHECK_GE(idx, 0);
- DCHECK_LT(idx, _free_buffers.size());
- return idx;
-}
-
-Status DiskIoMgr::add_write_range(RequestContext* writer, WriteRange* write_range) {
- DCHECK_LE(write_range->len(), _max_buffer_size);
- unique_lock<mutex> writer_lock(writer->_lock);
-
- if (writer->_state == RequestContext::Cancelled) {
- DCHECK(!writer->_status.ok());
- return writer->_status;
- }
-
- writer->add_request_range(write_range, false);
- return Status::OK();
-}
-
-/*
- * int DiskIoMgr::AssignQueue(const char* file, int disk_id, bool expected_local) {
- * // If it's a remote range, check for an appropriate remote disk queue.
- * if (!expected_local) {
- * if (IsDfsPath(file) && FLAGS_num_remote_hdfs_io_threads > 0) return RemoteDfsDiskId();
- * if (IsS3APath(file)) return RemoteS3DiskId();
- * }
- * // Assign to a local disk queue.
- * DCHECK(!IsS3APath(file)); // S3 is always remote.
- * if (disk_id == -1) {
- * // disk id is unknown, assign it a random one.
- * static int next_disk_id = 0;
- * disk_id = next_disk_id++;
- * }
- * // TODO: we need to parse the config for the number of dirs configured for this
- * // data node.
- * return disk_id % num_local_disks();
- * }
- */
-
-/*
- * DiskIoMgr::HdfsCachedFileHandle* DiskIoMgr::OpenHdfsFile(const hdfsFS& fs,
- * const char* fname, int64_t mtime) {
- * HdfsCachedFileHandle* fh = nullptr;
- *
- * // Check if a cached file handle exists and validate the mtime, if the mtime of the
- * // cached handle is not matching the mtime of the requested file, reopen.
- * if (detail::is_file_handle_caching_enabled() && _file_handle_cache.Pop(fname, &fh)) {
- * if (fh->mtime() == mtime) {
- * return fh;
- * }
- * VLOG_FILE << "mtime mismatch, closing cached file handle. Closing file=" << fname;
- * delete fh;
- * }
- *
- * fh = new HdfsCachedFileHandle(fs, fname, mtime);
- *
- * // Check if the file handle was opened correctly
- * if (!fh->ok()) {
- * VLOG_FILE << "Opening the file " << fname << " failed.";
- * delete fh;
- * return nullptr;
- * }
- *
- * return fh;
- * }
- */
-
-/*
- * void DiskIoMgr::cache_or_close_file_handle(const char* fname,
- * DiskIoMgr::HdfsCachedFileHandle* fid, bool close) {
- * // Try to unbuffer the handle, on filesystems that do not support this call a non-zero
- * // return code indicates that the operation was not successful and thus the file is
- * // closed.
- * if (detail::is_file_handle_caching_enabled() &&
- * !close && hdfsUnbufferFile(fid->file()) == 0) {
- * // Clear read statistics before returning
- * hdfsFileClearReadStatistics(fid->file());
- * _file_handle_cache.Put(fname, fid);
- * } else {
- * if (close) {
- * VLOG_FILE << "Closing file=" << fname;
- * } else {
- * VLOG_FILE << "FS does not support file handle unbuffering, closing file="
- * << fname;
- * }
- * delete fid;
- * }
- * }
- */
-
-} // namespace doris
diff --git a/be/src/runtime/disk_io_mgr.h b/be/src/runtime/disk_io_mgr.h
deleted file mode 100644
index 9d0aa2f5ae..0000000000
--- a/be/src/runtime/disk_io_mgr.h
+++ /dev/null
@@ -1,837 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-// This file is copied from
-// https://github.com/apache/impala/blob/branch-2.9.0/be/src/runtime/disk-io-mgr.h
-// and modified by Doris
-
-#pragma once
-
-#include <condition_variable>
-#include <list>
-#include <mutex>
-#include <thread>
-#include <unordered_set>
-#include <vector>
-
-#include "common/config.h"
-#include "common/object_pool.h"
-#include "common/status.h"
-#include "runtime/memory/mem_tracker_limiter.h"
-#include "util/error_util.h"
-#include "util/internal_queue.h"
-#include "util/metrics.h"
-#include "util/runtime_profile.h"
-#include "util/thread_group.h"
-
-namespace doris {
-
-class MemTracker;
-
-// Manager object that schedules IO for all queries on all disks and remote filesystems
-// (such as S3). Each query maps to one or more RequestContext objects, each of which
-// has its own queue of scan ranges and/or write ranges.
-//
-// The API splits up requesting scan/write ranges (non-blocking) and reading the data
-// (blocking). The DiskIoMgr has worker threads that will read from and write to
-// disk/hdfs/remote-filesystems, allowing interleaving of IO and CPU. This allows us to
-// keep all disks and all cores as busy as possible.
-//
-// All public APIs are thread-safe. It is not valid to call any of the APIs after
-// unregister_context() returns.
-//
-// For Readers:
-// We can model this problem as a multiple producer (threads for each disk), multiple
-// consumer (scan ranges) problem. There are multiple queues that need to be
-// synchronized. Conceptually, there are two queues:
-// 1. The per disk queue: this contains a queue of readers that need reads.
-// 2. The per scan range ready-buffer queue: this contains buffers that have been
-// read and are ready for the caller.
-// The disk queue contains a queue of readers and is scheduled in a round robin fashion.
-// Readers map to scan nodes. The reader then contains a queue of scan ranges. The caller
-// asks the IoMgr for the next range to process. The IoMgr then selects the best range
-// to read based on disk activity and begins reading and queuing buffers for that range.
-// TODO: We should map readers to queries. A reader is the unit of scheduling and queries
-// that have multiple scan nodes shouldn't have more 'turns'.
-//
-// For Writers:
-// Data is written via add_write_range(). This is non-blocking and adds a WriteRange to a
-// per-disk queue. After the write is complete, a callback in WriteRange is invoked.
-// No memory is allocated within IoMgr for writes and no copies are made. It is the
-// responsibility of the client to ensure that the data to be written is valid and that
-// the file to be written to exists until the callback is invoked.
-//
-// The IoMgr provides three key APIs.
-// 1. add_scan_ranges: this is non-blocking and tells the IoMgr all the ranges that
-// will eventually need to be read.
-// 2. get_next_range: returns to the caller the next scan range it should process.
-// This is based on disk load. This also begins reading the data in this scan
-// range. This is blocking.
-// 3. ScanRange::get_next: returns the next buffer for this range. This is blocking.
-//
-// The disk threads do not synchronize with each other. The readers and writers don't
-// synchronize with each other. There is a lock and condition variable for each request
-// context queue and each disk queue.
-// IMPORTANT: whenever both locks are needed, the lock order is to grab the context lock
-// before the disk lock.
-//
-// Scheduling: If there are multiple request contexts with work for a single disk, the
-// request contexts are scheduled in round-robin order. Multiple disk threads can
-// operate on the same request context. Exactly one request range is processed by a
-// disk thread at a time. If there are multiple scan ranges scheduled via
-// get_next_range() for a single context, these are processed in round-robin order.
-// If there are multiple scan and write ranges for a disk, a read is always followed
-// by a write, and a write is followed by a read, i.e. reads and writes alternate.
-// If multiple write ranges are enqueued for a single disk, they will be processed
-// by the disk threads in order, but may complete in any order. No guarantees are made
-// on ordering of writes across disks.
-//
-// Resource Management: effective resource management in the IoMgr is key to good
-// performance. The IoMgr helps coordinate two resources: CPU and disk. For CPU,
-// spinning up too many threads causes thrashing.
-// Memory usage in the IoMgr comes from queued read buffers. If we queue the minimum
-// (i.e. 1), then the disks are idle while we are processing the buffer. If we don't
-// limit the queue, then it possible we end up queueing the entire data set (i.e. CPU
-// is slower than disks) and run out of memory.
-// For both CPU and memory, we want to model the machine as having a fixed amount of
-// resources. If a single query is running, it should saturate either CPU or Disk
-// as well as using as little memory as possible. With multiple queries, each query
-// should get less CPU. In that case each query will need fewer queued buffers and
-// therefore have less memory usage.
-//
-// The IoMgr defers CPU management to the caller. The IoMgr provides a get_next_range
-// API which will return the next scan range the caller should process. The caller
-// can call this from the desired number of reading threads. Once a scan range
-// has been returned via get_next_range, the IoMgr will start to buffer reads for
-// that range and it is expected the caller will pull those buffers promptly. For
-// example, if the caller would like to have 1 scanner thread, the read loop
-// would look like:
-// while (more_ranges)
-// range = get_next_range()
-// while (!range.eosr)
-// buffer = range.get_next()
-// To have multiple reading threads, the caller would simply spin up the threads
-// and each would process the loops above.
-//
-// To control the number of IO buffers, each scan range has a soft max capacity for
-// the number of queued buffers. If the number of buffers is at capacity, the IoMgr
-// will no longer read for that scan range until the caller has processed a buffer.
-// This capacity does not need to be fixed, and the caller can dynamically adjust
-// it if necessary.
-//
-// As an example: If we allowed 5 buffers per range on a 24 core, 72 thread
-// (we default to allowing 3x threads) machine, we should see at most
-// 72 * 5 * 8MB = 2.8GB in io buffers memory usage. This should remain roughly constant
-// regardless of how many concurrent readers are running.
-//
-// Buffer Management:
-// Buffers are allocated by the IoMgr as necessary to service reads. These buffers
-// are directly returned to the caller. The caller must call Return() on the buffer
-// when it is done, at which point the buffer will be recycled for another read. In error
-// cases, the IoMgr will recycle the buffers more promptly but regardless, the caller
-// must always call Return()
-//
-// Caching support:
-// Scan ranges contain metadata on whether or not it is cached on the DN. In that
-// case, we use the HDFS APIs to read the cached data without doing any copies. For these
-// ranges, the reads happen on the caller thread (as opposed to the disk threads).
-// It is possible for the cached read APIs to fail, in which case the ranges are then
-// queued on the disk threads and behave identically to the case where the range
-// is not cached.
-// Resources for these ranges are also not accounted against the reader because none
-// are consumed.
-// While a cached block is being processed, the block is mlocked. We want to minimize
-// the time the mlock is held.
-// - HDFS will time us out if we hold onto the mlock for too long
-// - Holding the lock prevents uncaching this file due to a caching policy change.
-// Therefore, we only issue the cached read when the caller is ready to process the
-// range (get_next_range()) instead of when the ranges are issued. This guarantees that
-// there will be a CPU available to process the buffer and any throttling we do with
-// the number of scanner threads properly controls the amount of files we mlock.
-// With cached scan ranges, we cannot close the scan range until the cached buffer
-// is returned (HDFS does not allow this). We therefore need to defer the close until
-// the cached buffer is returned (BufferDescriptor::Return()).
-//
-// Remote filesystem support (e.g. S3):
-// Remote filesystems are modeled as "remote disks". That is, there is a separate disk
-// queue for each supported remote filesystem type. In order to maximize throughput,
-// multiple connections are opened in parallel by having multiple threads running per
-// queue. Also note that reading from a remote filesystem service can be more CPU
-// intensive than local disk/hdfs because of non-direct I/O and SSL processing, and can
-// be CPU bottlenecked especially if not enough I/O threads for these queues are
-// started.
-//
-// TODO: IoMgr should be able to request additional scan ranges from the coordinator
-// to help deal with stragglers.
-// TODO: look into using a lock free queue
-// TODO: simplify the common path (less locking, memory allocations).
-// TODO: Break this up the .h/.cc into multiple files under an /io subdirectory.
-//
-// Structure of the Implementation:
-// - All client APIs are defined in this file
-// - Internal classes are defined in disk-io-mgr-internal.h
-// - ScanRange APIs are implemented in disk-io-mgr-scan-range.cc
-// This contains the ready buffer queue logic
-// - RequestContext APIs are implemented in disk-io-mgr-reader-context.cc
-// This contains the logic for picking scan ranges for a reader.
-// - Disk Thread and general APIs are implemented in disk-io-mgr.cc.
-
-typedef void* hdfsFS;
-typedef void* hdfsFile;
-
-class DiskIoMgr {
-public:
- class RequestContext;
- class ScanRange;
-
- // This class is a small wrapper around the hdfsFile handle and the file system
- // instance which is needed to close the file handle in case of eviction. It
- // additionally encapsulates the last modified time of the associated file when it was
- // last opened.
- class HdfsCachedFileHandle {
- public:
- // Constructor will open the file
- HdfsCachedFileHandle(const hdfsFS& fs, const char* fname, int64_t mtime);
-
- // Destructor will close the file handle
- ~HdfsCachedFileHandle();
-
- hdfsFile file() const { return _hdfs_file; }
-
- int64_t mtime() const { return _mtime; }
-
- // This method is called to release acquired resources by the cached handle when it
- // is evicted.
- static void release(HdfsCachedFileHandle** h);
-
- bool ok() const { return _hdfs_file != nullptr; }
-
- private:
- hdfsFS _fs;
- hdfsFile _hdfs_file;
- int64_t _mtime;
- };
-
- // Buffer struct that is used by the caller and IoMgr to pass read buffers.
- // It is expected that only one thread has ownership of this object at a
- // time.
- class BufferDescriptor {
- public:
- // a null dtor to pass codestyle check
- ~BufferDescriptor() {}
-
- ScanRange* scan_range() { return _scan_range; }
- char* buffer() { return _buffer; }
- int64_t buffer_len() { return _buffer_len; }
- int64_t len() { return _len; }
- bool eosr() { return _eosr; }
-
- // Returns the offset within the scan range that this buffer starts at
- int64_t scan_range_offset() const { return _scan_range_offset; }
-
- // Returns the buffer to the IoMgr. This must be called for every buffer
- // returned by get_next()/read() that did not return an error. This is non-blocking.
- // After calling this, the buffer descriptor is invalid and cannot be accessed.
- void return_buffer();
-
- private:
- friend class DiskIoMgr;
- BufferDescriptor(DiskIoMgr* io_mgr);
-
- // Resets the buffer descriptor state for a new reader, range and data buffer.
- void reset(RequestContext* reader, ScanRange* range, char* buffer, int64_t buffer_len);
-
- DiskIoMgr* _io_mgr;
-
- // Reader that this buffer is for
- RequestContext* _reader;
-
- // Scan range that this buffer is for.
- ScanRange* _scan_range;
-
- // buffer with the read contents
- char* _buffer;
-
- // length of _buffer. For buffers from cached reads, the length is 0.
- int64_t _buffer_len;
-
- // length of read contents
- int64_t _len;
-
- // true if the current scan range is complete
- bool _eosr;
-
- // Status of the read to this buffer. if status is not ok, 'buffer' is nullptr
- Status _status;
-
- int64_t _scan_range_offset;
- };
-
- // The request type, read or write associated with a request range.
- struct RequestType {
- enum type {
- READ,
- WRITE,
- };
- };
-
- // Represents a contiguous sequence of bytes in a single file.
- // This is the common base class for read and write IO requests - ScanRange and
- // WriteRange. Each disk thread processes exactly one RequestRange at a time.
- class RequestRange : public InternalQueue<RequestRange>::Node {
- public:
- // hdfsFS fs() const { return _fs; }
- const char* file() const { return _file.c_str(); }
- int64_t offset() const { return _offset; }
- int64_t len() const { return _len; }
- int disk_id() const { return _disk_id; }
- RequestType::type request_type() const { return _request_type; }
-
- protected:
- // Hadoop filesystem that contains _file, or set to nullptr for local filesystem.
- hdfsFS _fs;
-
- // Path to file being read or written.
- std::string _file;
-
- // Offset within _file being read or written.
- int64_t _offset;
-
- // Length of data read or written.
- int64_t _len;
-
- // Id of disk containing byte range.
- int _disk_id;
-
- // The type of IO request, READ or WRITE.
- RequestType::type _request_type;
- };
-
- // ScanRange description. The caller must call Reset() to initialize the fields
- // before calling add_scan_ranges(). The private fields are used internally by
- // the IoMgr.
- class ScanRange : public RequestRange {
- public:
- // If the mtime is set to NEVER_CACHE, the file handle should never be cached.
- const static int64_t NEVER_CACHE = -1;
-
- // The initial queue capacity for this. Specify -1 to use IoMgr default.
- ScanRange() : ScanRange(-1) {}
- ScanRange(int initial_capacity);
-
- virtual ~ScanRange();
-
- // Resets this scan range object with the scan range description. The scan range
- // must fall within the file bounds (offset >= 0 and offset + len <= file_length).
- // Resets this scan range object with the scan range description.
- void reset(hdfsFS fs, const char* file, int64_t len, int64_t offset, int disk_id,
- bool try_cache, bool expected_local, int64_t mtime, void* metadata = nullptr);
-
- void* meta_data() const { return _meta_data; }
- // bool try_cache() const { return _try_cache; }
- bool expected_local() const { return _expected_local; }
- int ready_buffers_capacity() const { return _ready_buffers_capacity; }
-
- // Returns the next buffer for this scan range. buffer is an output parameter.
- // This function blocks until a buffer is ready or an error occurred. If this is
- // called when all buffers have been returned, *buffer is set to nullptr and Status::OK()
- // is returned.
- // Only one thread can be in get_next() at any time.
- Status get_next(BufferDescriptor** buffer);
-
- // Cancel this scan range. This cleans up all queued buffers and
- // wakes up any threads blocked on get_next().
- // Status is the reason the range was cancelled. Must not be ok().
- // Status is returned to the user in get_next().
- void cancel(const Status& status);
-
- // return a descriptive string for debug.
- std::string debug_string() const;
-
- int64_t mtime() const { return _mtime; }
-
- private:
- friend class DiskIoMgr;
-
- // Initialize internal fields
- void init_internal(DiskIoMgr* io_mgr, RequestContext* reader);
-
- // Enqueues a buffer for this range. This does not block.
- // Returns true if this scan range has hit the queue capacity, false otherwise.
- // The caller passes ownership of buffer to the scan range and it is not
- // valid to access buffer after this call.
- bool enqueue_buffer(BufferDescriptor* buffer);
-
- // Cleanup any queued buffers (i.e. due to cancellation). This cannot
- // be called with any locks taken.
- void cleanup_queued_buffers();
-
- // Validates the internal state of this range. _lock must be taken
- // before calling this.
- bool validate();
-
- // Maximum length in bytes for hdfsRead() calls.
- int64_t max_read_chunk_size() const;
-
- // Opens the file for this range. This function only modifies state in this range.
- Status open();
-
- // Closes the file for this range. This function only modifies state in this range.
- void close();
-
- // Reads from this range into 'buffer'. Buffer is preallocated. Returns the number
- // of bytes read. Updates range to keep track of where in the file we are.
- Status read(char* buffer, int64_t* bytes_read, bool* eosr);
-
- // Reads from the DN cache. On success, sets _cached_buffer to the DN buffer
- // and *read_succeeded to true.
- // If the data is not cached, returns ok() and *read_succeeded is set to false.
- // Returns a non-ok status if it ran into a non-continuable error.
- Status read_from_cache(bool* read_succeeded);
-
- // Pointer to caller specified metadata. This is untouched by the io manager
- // and the caller can put whatever auxiliary data in here.
- void* _meta_data;
-
- // If true, this scan range is expected to be cached. Note that this might be wrong
- // since the block could have been uncached. In that case, the cached path
- // will fail and we'll just put the scan range on the normal read path.
- bool _try_cache;
-
- // If true, we expect this scan range to be a local read. Note that if this is false,
- // it does not necessarily mean we expect the read to be remote, and that we never
- // create scan ranges where some of the range is expected to be remote and some of it
- // local.
- // TODO: we can do more with this
- bool _expected_local;
-
- DiskIoMgr* _io_mgr;
-
- // Reader/owner of the scan range
- RequestContext* _reader;
-
- // File handle either to hdfs or local fs (FILE*)
- //
- // TODO: The pointer to HdfsCachedFileHandle is manually managed and should be
- // replaced by unique_ptr in C++11
- union {
- FILE* _local_file;
- HdfsCachedFileHandle* _hdfs_file;
- };
-
- // If non-null, this is DN cached buffer. This means the cached read succeeded
- // and all the bytes for the range are in this buffer.
- // TODO(zxy) Not used, maybe delete
- struct hadoopRzBuffer* _cached_buffer;
-
- // Lock protecting fields below.
- // This lock should not be taken during Open/Read/Close.
- std::mutex _lock;
-
- // Number of bytes read so far for this scan range
- int _bytes_read;
-
- // Status for this range. This is non-ok if _is_cancelled is true.
- // Note: an individual range can fail without the RequestContext being
- // cancelled. This allows us to skip individual ranges.
- Status _status;
-
- // If true, the last buffer for this scan range has been queued.
- bool _eosr_queued;
-
- // If true, the last buffer for this scan range has been returned.
- bool _eosr_returned;
-
- // If true, this scan range has been removed from the reader's in_flight_ranges
- // queue because the _ready_buffers queue is full.
- bool _blocked_on_queue;
-
- // IO buffers that are queued for this scan range.
- // Condition variable for get_next
- std::condition_variable _buffer_ready_cv;
- std::list<BufferDescriptor*> _ready_buffers;
-
- // The soft capacity limit for _ready_buffers. _ready_buffers can exceed
- // the limit temporarily as the capacity is adjusted dynamically.
- // In that case, the capacity is only realized when the caller removes buffers
- // from _ready_buffers.
- int _ready_buffers_capacity;
-
- // Lock that should be taken during hdfs calls. Only one thread (the disk reading
- // thread) calls into hdfs at a time so this lock does not have performance impact.
- // This lock only serves to coordinate cleanup. Specifically it serves to ensure
- // that the disk threads are finished with HDFS calls before _is_cancelled is set
- // to true and cleanup starts.
- // If this lock and _lock need to be taken, _lock must be taken first.
- std::mutex _hdfs_lock;
-
- // If true, this scan range has been cancelled.
- bool _is_cancelled;
-
- // Last modified time of the file associated with the scan range
- int64_t _mtime;
- };
-
- // Used to specify data to be written to a file and offset.
- // It is the responsibility of the client to ensure that the data to be written is
- // valid and that the file to be written to exists until the callback is invoked.
- // A callback is invoked to inform the client when the write is done.
- class WriteRange : public RequestRange {
- public:
- // a null dtor to pass codestyle check
- ~WriteRange() {}
-
- // This callback is invoked on each WriteRange after the write is complete or the
- // context is cancelled. The status returned by the callback parameter indicates
- // if the write was successful (i.e. Status::OK()), if there was an error
- // TStatusCode::RUNTIME_ERROR) or if the context was cancelled
- // (TStatusCode::CANCELLED). The callback is only invoked if this WriteRange was
- // successfully added (i.e. add_write_range() succeeded). No locks are held while
- // the callback is invoked.
- typedef std::function<void(const Status&)> WriteDoneCallback;
- WriteRange(const std::string& file, int64_t file_offset, int disk_id,
- WriteDoneCallback callback);
-
- // Set the data and number of bytes to be written for this WriteRange.
- // File data can be over-written by calling set_data() and add_write_range().
- void set_data(const uint8_t* buffer, int64_t len);
-
- private:
- friend class DiskIoMgr;
-
- // Data to be written. RequestRange::_len contains the length of data
- // to be written.
- const uint8_t* _data;
-
- // Callback to invoke after the write is complete.
- WriteDoneCallback _callback;
- };
-
- // Create a DiskIoMgr object.
- // - num_disks: The number of disks the IoMgr should use. This is used for testing.
- // Specify 0, to have the disk IoMgr query the os for the number of disks.
- // - threads_per_disk: number of read threads to create per disk. This is also
- // the max queue depth.
- // - min_buffer_size: minimum io buffer size (in bytes)
- // - max_buffer_size: maximum io buffer size (in bytes). Also the max read size.
- DiskIoMgr(int num_disks, int threads_per_disk, int min_buffer_size, int max_buffer_size);
-
- // Create DiskIoMgr with default configs.
- DiskIoMgr();
-
- // Clean up all threads and resources. This is mostly useful for testing since
- // for impalad, this object is never destroyed.
- ~DiskIoMgr();
-
- // Initialize the IoMgr. Must be called once before any of the other APIs.
- Status init(const int64_t mem_limit);
-
- // Allocates tracking structure for a request context.
- // Register a new request context which is returned in *request_context.
- // The IoMgr owns the allocated RequestContext object. The caller must call
- // unregister_context() for each context.
- // reader_mem_tracker: Is non-null only for readers. IO buffers
- // used for this reader will be tracked by this. If the limit is exceeded
- // the reader will be cancelled and MEM_LIMIT_EXCEEDED will be returned via
- // get_next().
- Status register_context(RequestContext** request_context);
-
- // Unregisters context from the disk IoMgr. This must be called for every
- // register_context() regardless of cancellation and must be called in the
- // same thread as get_next()
- // The 'context' cannot be used after this call.
- // This call blocks until all the disk threads have finished cleaning up.
- // unregister_context also cancels the reader/writer from the disk IoMgr.
- void unregister_context(RequestContext* context);
-
- // This function cancels the context asynchronously. All outstanding requests
- // are aborted and tracking structures cleaned up. This does not need to be
- // called if the context finishes normally.
- // This will also fail any outstanding get_next()/Read requests.
- // If wait_for_disks_completion is true, wait for the number of active disks for this
- // context to reach 0. After calling with wait_for_disks_completion = true, the only
- // valid API is returning IO buffers that have already been returned.
- // Takes context->_lock if wait_for_disks_completion is true.
- void cancel_context(RequestContext* context, bool wait_for_disks_completion = false);
-
- // Adds the scan ranges to the queues. This call is non-blocking. The caller must
- // not deallocate the scan range pointers before unregister_context().
- // If schedule_immediately, the ranges are immediately put on the read queue
- // (i.e. the caller should not/cannot call get_next_range for these ranges).
- // This can be used to do synchronous reads as well as schedule dependent ranges,
- // as in the case for columnar formats.
- Status add_scan_ranges(RequestContext* reader, const std::vector<ScanRange*>& ranges,
- bool schedule_immediately = false);
-
- // Add a WriteRange for the writer. This is non-blocking and schedules the context
- // on the IoMgr disk queue. Does not create any files.
- Status add_write_range(RequestContext* writer, WriteRange* write_range);
-
- // Returns the next unstarted scan range for this reader. When the range is returned,
- // the disk threads in the IoMgr will already have started reading from it. The
- // caller is expected to call ScanRange::get_next on the returned range.
- // If there are no more unstarted ranges, nullptr is returned.
- // This call is blocking.
- Status get_next_range(RequestContext* reader, ScanRange** range);
-
- // Reads the range and returns the result in buffer.
- // This behaves like the typical synchronous read() api, blocking until the data
- // is read. This can be called while there are outstanding ScanRanges and is
- // thread safe. Multiple threads can be calling read() per reader at a time.
- // range *cannot* have already been added via add_scan_ranges.
- Status read(RequestContext* reader, ScanRange* range, BufferDescriptor** buffer);
-
- // Determine which disk queue this file should be assigned to. Returns an index into
- // _disk_queues. The disk_id is the volume ID for the local disk that holds the
- // files, or -1 if unknown. Flag expected_local is true iff this impalad is
- // co-located with the datanode for this file.
- /*
- * int AssignQueue(const char* file, int disk_id, bool expected_local);
- */
-
- // TODO: The functions below can be moved to RequestContext.
- // Returns the current status of the context.
- Status context_status(RequestContext* context) const;
-
- // Returns the number of unstarted scan ranges for this reader.
- int num_unstarted_ranges(RequestContext* reader) const;
-
- void set_bytes_read_counter(RequestContext*, RuntimeProfile::Counter*);
- void set_read_timer(RequestContext*, RuntimeProfile::Counter*);
- void set_active_read_thread_counter(RequestContext*, RuntimeProfile::Counter*);
- void set_disks_access_bitmap(RequestContext*, RuntimeProfile::Counter*);
-
- int64_t queue_size(RequestContext* reader) const;
- int64_t bytes_read_local(RequestContext* reader) const;
- int64_t bytes_read_short_circuit(RequestContext* reader) const;
- int64_t bytes_read_dn_cache(RequestContext* reader) const;
- int num_remote_ranges(RequestContext* reader) const;
- int64_t unexpected_remote_bytes(RequestContext* reader) const;
- MemTrackerLimiter* mem_tracker() const { return _mem_tracker.get(); }
-
- // Returns the read throughput across all readers.
- // TODO: should this be a sliding window? This should report metrics for the
- // last minute, hour and since the beginning.
- int64_t get_read_throughput();
-
- // Returns the maximum read buffer size
- int max_read_buffer_size() const { return _max_buffer_size; }
-
- // Returns the total number of disk queues (both local and remote).
- int num_total_disks() const { return _disk_queues.size(); }
-
- // Returns the total number of remote "disk" queues.
- int num_remote_disks() const { return REMOTE_NUM_DISKS; }
-
- // Returns the number of local disks attached to the system.
- int num_local_disks() const { return num_total_disks() - num_remote_disks(); }
-
- // The disk ID (and therefore _disk_queues index) used for DFS accesses.
- // int RemoteDfsDiskId() const { return num_local_disks() + REMOTE_DFS_DISK_OFFSET; }
-
- // The disk ID (and therefore _disk_queues index) used for S3 accesses.
- // int RemoteS3DiskId() const { return num_local_disks() + REMOTE_S3_DISK_OFFSET; }
-
- // Returns the number of allocated buffers.
- int num_allocated_buffers() const { return _num_allocated_buffers; }
-
- // Returns the number of buffers currently owned by all readers.
- int num_buffers_in_readers() const { return _num_buffers_in_readers; }
-
- // Dumps the disk IoMgr queues (for readers and disks)
- std::string debug_string();
-
- // Validates the internal state is consistent. This is intended to only be used
- // for debugging.
- bool validate() const;
-
- // Given a FS handle, name and last modified time of the file, tries to open that file
- // and return an instance of HdfsCachedFileHandle. In case of an error returns nullptr.
- // HdfsCachedFileHandle* OpenHdfsFile(const hdfsFS& fs, const char* fname, int64_t mtime);
-
- // When the file handle is no longer in use by the scan range, return it and try to
- // unbuffer the handle. If unbuffering, closing sockets and dropping buffers in the
- // libhdfs client, is not supported, close the file handle. If the unbuffer operation
- // is supported, put the file handle together with the mtime in the LRU cache for
- // later reuse.
- // void cache_or_close_file_handle(const char* fname, HdfsCachedFileHandle* fid, bool close);
-
- // Default ready buffer queue capacity. This constant doesn't matter too much
- // since the system dynamically adjusts.
- static const int DEFAULT_QUEUE_CAPACITY;
-
- // "Disk" queue offsets for remote accesses. Offset 0 corresponds to
- // disk ID (i.e. _disk_queue index) of num_local_disks().
- enum { REMOTE_DFS_DISK_OFFSET = 0, REMOTE_S3_DISK_OFFSET, REMOTE_NUM_DISKS };
-
-private:
- friend class BufferDescriptor;
- struct DiskQueue;
- class RequestContextCache;
-
- // Pool to allocate BufferDescriptors.
- ObjectPool _pool;
-
- std::unique_ptr<MemTrackerLimiter> _mem_tracker;
-
- // Number of worker(read) threads per disk. Also the max depth of queued
- // work to the disk.
- const int _num_threads_per_disk;
-
- // Maximum read size. This is also the maximum size of each allocated buffer.
- const int _max_buffer_size;
-
- // The minimum size of each read buffer.
- const int _min_buffer_size;
-
- // Thread group containing all the worker threads.
- // ThreadGroup _disk_thread_group;
- ThreadGroup _disk_thread_group;
-
- // Options object for cached hdfs reads. Set on startup and never modified.
- struct hadoopRzOptions* _cached_read_options;
-
- // True if the IoMgr should be torn down. Worker threads watch for this to
- // know to terminate. This variable is read/written to by different threads.
- std::atomic<bool> _shut_down;
-
- // Total bytes read by the IoMgr.
- RuntimeProfile::Counter _total_bytes_read_counter;
-
- // Total time spent in hdfs reading
- RuntimeProfile::Counter _read_timer;
-
- // Contains all contexts that the IoMgr is tracking. This includes contexts that are
- // active as well as those in the process of being cancelled. This is a cache
- // of context objects that get recycled to minimize object allocations and lock
- // contention.
- std::unique_ptr<RequestContextCache> _request_context_cache;
-
- // Protects _free_buffers and _free_buffer_descs
- std::mutex _free_buffers_lock;
-
- // Free buffers that can be handed out to clients. There is one list for each buffer
- // size, indexed by the Log2 of the buffer size in units of _min_buffer_size. The
- // maximum buffer size is _max_buffer_size, so the maximum index is
- // Log2(_max_buffer_size / _min_buffer_size).
- //
- // E.g. if _min_buffer_size = 1024 bytes:
- // _free_buffers[0] => list of free buffers with size 1024 B
- // _free_buffers[1] => list of free buffers with size 2048 B
- // _free_buffers[10] => list of free buffers with size 1 MB
- // _free_buffers[13] => list of free buffers with size 8 MB
- // _free_buffers[n] => list of free buffers with size 2^n * 1024 B
- std::vector<std::list<char*>> _free_buffers;
-
- // List of free buffer desc objects that can be handed out to clients
- std::list<BufferDescriptor*> _free_buffer_descs;
-
- // Total number of allocated buffers, used for debugging.
- std::atomic<int> _num_allocated_buffers {0};
-
- // Total number of buffers in readers
- std::atomic<int> _num_buffers_in_readers {0};
-
- // Per disk queues. This is static and created once at init() time. One queue is
- // allocated for each local disk on the system and for each remote filesystem type.
- // It is indexed by disk id.
- std::vector<DiskQueue*> _disk_queues;
-
- // Caching structure that maps file names to cached file handles. The cache has an upper
- // limit of entries defined by FLAGS_max_cached_file_handles. Evicted cached file
- // handles are closed.
- // FifoMultimap<std::string, HdfsCachedFileHandle*> _file_handle_cache;
- std::multimap<std::string, HdfsCachedFileHandle*> _file_handle_cache;
-
- // Returns the index into _free_buffers for a given buffer size
- int free_buffers_idx(int64_t buffer_size);
-
- // Gets a buffer description object, initialized for this reader, allocating one as
- // necessary. buffer_size / _min_buffer_size should be a power of 2, and buffer_size
- // should be <= _max_buffer_size. These constraints will be met if buffer was acquired
- // via get_free_buffer() (which it should have been).
- BufferDescriptor* get_buffer_desc(RequestContext* reader, ScanRange* range, char* buffer,
- int64_t buffer_size);
-
- // Returns a buffer desc object which can now be used for another reader.
- void return_buffer_desc(BufferDescriptor* desc);
-
- // Returns the buffer desc and underlying buffer to the disk IoMgr. This also updates
- // the reader and disk queue state.
- void return_buffer(BufferDescriptor* buffer);
-
- // Returns a buffer to read into with size between *buffer_size and _max_buffer_size,
- // and *buffer_size is set to the size of the buffer. If there is an
- // appropriately-sized free buffer in the '_free_buffers', that is returned, otherwise
- // a new one is allocated. *buffer_size must be between 0 and _max_buffer_size.
- char* get_free_buffer(int64_t* buffer_size);
-
- // Garbage collect all unused io buffers. This is currently only triggered when the
- // process wide limit is hit.
- // TODO: make this run periodically?
- void gc_io_buffers(int64_t bytes_to_free = INT_MAX);
-
- // Returns a buffer to the free list. buffer_size / _min_buffer_size should be a power
- // of 2, and buffer_size should be <= _max_buffer_size. These constraints will be met
- // if buffer was acquired via get_free_buffer() (which it should have been).
- void return_free_buffer(char* buffer, int64_t buffer_size);
-
- // Returns the buffer in desc (cannot be nullptr), sets buffer to nullptr
- void return_free_buffer(BufferDescriptor* desc);
-
- // Disk worker thread loop. This function retrieves the next range to process on
- // the disk queue and invokes read_range() or Write() depending on the type of Range().
- // There can be multiple threads per disk running this loop.
- void work_loop(DiskQueue* queue);
-
- // This is called from the disk thread to get the next range to process. It will
- // wait until a scan range and buffer are available, or a write range is available.
- // This function returns the range to process.
- // Only returns false if the disk thread should be shut down.
- // No locks should be taken before this function call and none are left taken after.
- bool get_next_request_range(DiskQueue* disk_queue, RequestRange** range,
- RequestContext** request_context);
-
- // Updates disk queue and reader state after a read is complete. The read result
- // is captured in the buffer descriptor.
- void handle_read_finished(DiskQueue*, RequestContext*, BufferDescriptor*);
-
- // Invokes write_range->_callback after the range has been written and
- // updates per-disk state and handle state. The status of the write OK/RUNTIME_ERROR
- // etc. is passed via write_status and to the callback.
- // The write_status does not affect the writer->_status. That is, a write error does
- // not cancel the writer context - that decision is left to the callback handler.
- // TODO: On the read path, consider not canceling the reader context on error.
- void handle_write_finished(RequestContext* writer, WriteRange* write_range,
- const Status& write_status);
-
- // Validates that range is correctly initialized
- Status validate_scan_range(ScanRange* range);
-
- // Write the specified range to disk and calls handle_write_finished when done.
- // Responsible for opening and closing the file that is written.
- void write(RequestContext* writer_context, WriteRange* write_range);
-
- // Helper method to write a range using the specified FILE handle. Returns Status:OK
- // if the write succeeded, or a RUNTIME_ERROR with an appropriate message otherwise.
- // Does not open or close the file that is written.
- Status write_range_helper(FILE* file_handle, WriteRange* write_range);
-
- // Reads the specified scan range and calls handle_read_finished when done.
- void read_range(DiskQueue* disk_queue, RequestContext* reader, ScanRange* range);
-};
-
-} // end namespace doris
diff --git a/be/src/runtime/disk_io_mgr_internal.h b/be/src/runtime/disk_io_mgr_internal.h
deleted file mode 100644
index 46b74aa946..0000000000
--- a/be/src/runtime/disk_io_mgr_internal.h
+++ /dev/null
@@ -1,455 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-// This file is copied from
-// https://github.com/apache/impala/blob/branch-2.9.0/be/src/runtime/disk-io-mgr-internal.h
-// and modified by Doris
-
-#pragma once
-
-#include <unistd.h>
-
-#include <queue>
-
-#include "common/logging.h"
-#include "common/status.h"
-#include "disk_io_mgr.h"
-#include "util/cpu_info.h"
-#include "util/debug_util.h"
-#include "util/disk_info.h"
-#include "util/filesystem_util.h"
-
-// This file contains internal structures to the IoMgr. Users of the IoMgr do
-// not need to include this file.
-namespace doris {
-
-// Per disk state
-struct DiskIoMgr::DiskQueue {
- // Disk id (0-based)
- int disk_id;
-
- // Lock that protects access to 'request_contexts' and 'work_available'
- std::mutex lock;
-
- // Condition variable to signal the disk threads that there is work to do or the
- // thread should shut down. A disk thread will be woken up when there is a reader
- // added to the queue. A reader is only on the queue when it has at least one
- // scan range that is not blocked on available buffers.
- std::condition_variable work_available;
-
- // list of all request contexts that have work queued on this disk
- std::list<RequestContext*> request_contexts;
-
- // Enqueue the request context to the disk queue. The DiskQueue lock must not be taken.
- void enqueue_context(RequestContext* worker) {
- {
- std::unique_lock<std::mutex> disk_lock(lock);
- // Check that the reader is not already on the queue
- DCHECK(find(request_contexts.begin(), request_contexts.end(), worker) ==
- request_contexts.end());
- request_contexts.push_back(worker);
- }
- work_available.notify_all();
- }
-
- DiskQueue(int id) : disk_id(id) {}
-};
-
-// Internal per request-context state. This object maintains a lot of state that is
-// carefully synchronized. The context maintains state across all disks as well as
-// per disk state.
-// The unit for an IO request is a RequestRange, which may be a ScanRange or a
-// WriteRange.
-// A scan range for the reader is in one of five states:
-// 1) PerDiskState's unstarted_ranges: This range has only been queued
-// and nothing has been read from it.
-// 2) RequestContext's _ready_to_start_ranges: This range is about to be started.
-// As soon as the reader picks it up, it will move to the in_flight_ranges
-// queue.
-// 3) PerDiskState's in_flight_ranges: This range is being processed and will
-// be read from the next time a disk thread picks it up in get_next_request_range()
-// 4) ScanRange's outgoing ready buffers is full. We can't read for this range
-// anymore. We need the caller to pull a buffer off which will put this in
-// the in_flight_ranges queue. These ranges are in the RequestContext's
-// _blocked_ranges queue.
-// 5) ScanRange is cached and in the _cached_ranges queue.
-//
-// If the scan range is read and does not get blocked on the outgoing queue, the
-// transitions are: 1 -> 2 -> 3.
-// If the scan range does get blocked, the transitions are
-// 1 -> 2 -> 3 -> (4 -> 3)*
-//
-// In the case of a cached scan range, the range is immediately put in _cached_ranges.
-// When the caller asks for the next range to process, we first pull ranges from
-// the _cached_ranges queue. If the range was cached, the range is removed and
-// done (ranges are either entirely cached or not at all). If the cached read attempt
-// fails, we put the range in state 1.
-//
-// A write range for a context may be in one of two lists:
-// 1) _unstarted_write_ranges : Ranges that have been queued but not processed.
-// 2) _in_flight_ranges: The write range is ready to be processed by the next disk thread
-// that picks it up in get_next_request_range().
-//
-// AddWriteRange() adds WriteRanges for a disk.
-// It is the responsibility of the client to pin the data to be written via a WriteRange
-// in memory. After a WriteRange has been written, a callback is invoked to inform the
-// client that the write has completed.
-//
-// An important assumption is that write does not exceed the maximum read size and that
-// the entire range is written when the write request is handled. (In other words, writes
-// are not broken up.)
-//
-// When a RequestContext is processed by a disk thread in get_next_request_range(), a write
-// range is always removed from the list of unstarted write ranges and appended to the
-// _in_flight_ranges queue. This is done to alternate reads and writes - a read that is
-// scheduled (by calling GetNextRange()) is always followed by a write (if one exists).
-// And since at most one WriteRange can be present in _in_flight_ranges at any time
-// (once a write range is returned from get_next_request_range() it is completed and not
-// re-enqueued), a scan range scheduled via a call to GetNextRange() can be queued up
-// behind at most one write range.
-class DiskIoMgr::RequestContext {
-public:
- enum State {
- // Reader is initialized and maps to a client
- Active,
-
- // Reader is in the process of being cancelled. Cancellation is coordinated between
- // different threads and when they are all complete, the reader context is moved to
- // the inactive state.
- Cancelled,
-
- // Reader context does not map to a client. Accessing memory in this context
- // is invalid (i.e. it is equivalent to a dangling pointer).
- Inactive,
- };
-
- RequestContext(DiskIoMgr* parent, int num_disks);
-
- // Resets this object.
- void reset();
-
- // Decrements the number of active disks for this reader. If the disk count
- // goes to 0, the disk complete condition variable is signaled.
- // Reader lock must be taken before this call.
- void decrement_disk_ref_count() {
- // boost doesn't let us dcheck that the reader lock is taken
- DCHECK_GT(_num_disks_with_ranges, 0);
- if (--_num_disks_with_ranges == 0) {
- _disks_complete_cond_var.notify_one();
- }
- DCHECK(validate()) << std::endl << debug_string();
- }
-
- // Reader & Disk Scheduling: Readers that currently can't do work are not on
- // the disk's queue. These readers are ones that don't have any ranges in the
- // in_flight_queue AND have not prepared a range by setting next_range_to_start.
- // The rule to make sure readers are scheduled correctly is to ensure anytime a
- // range is put on the in_flight_queue or anytime next_range_to_start is set to
- // nullptr, the reader is scheduled.
-
- // Adds range to in_flight_ranges, scheduling this reader on the disk threads
- // if necessary.
- // Reader lock must be taken before this.
- void schedule_scan_range(DiskIoMgr::ScanRange* range) {
- DCHECK_EQ(_state, Active);
- DCHECK(range != nullptr);
- RequestContext::PerDiskState& state = _disk_states[range->disk_id()];
- state.in_flight_ranges()->enqueue(range);
- state.schedule_context(this, range->disk_id());
- }
-
- // Cancels the context with status code 'status'.
- void cancel(const Status& status);
-
- // Adds request range to disk queue for this request context. Currently,
- // schedule_immediately must be false if the RequestRange is a write range.
- void add_request_range(DiskIoMgr::RequestRange* range, bool schedule_immediately);
-
- // Returns the default queue capacity for scan ranges. This is updated
- // as the reader processes ranges.
- int initial_scan_range_queue_capacity() const { return _initial_queue_capacity; }
-
- // Validates invariants of reader. Reader lock must be taken beforehand.
- bool validate() const;
-
- // Dumps out reader information. Lock should be taken by caller
- std::string debug_string() const;
-
-private:
- friend class DiskIoMgr;
- class PerDiskState;
-
- // Parent object
- DiskIoMgr* _parent;
-
- // Total bytes read for this reader
- RuntimeProfile::Counter* _bytes_read_counter;
-
- // Total time spent in hdfs reading
- RuntimeProfile::Counter* _read_timer;
-
- // Number of active read threads
- RuntimeProfile::Counter* _active_read_thread_counter;
-
- // Disk access bitmap. The counter's bit[i] is set if disk id i has been accessed.
- // TODO: we can only support up to 64 disks with this bitmap but it lets us use a
- // builtin atomic instruction. Probably good enough for now.
- RuntimeProfile::Counter* _disks_accessed_bitmap;
-
- // Total number of bytes read locally, updated at end of each range scan
- std::atomic<int64_t> _bytes_read_local {0};
-
- // Total number of bytes read via short circuit read, updated at end of each range scan
- std::atomic<int64_t> _bytes_read_short_circuit {0};
-
- // Total number of bytes read from data node cache, updated at end of each range scan
- std::atomic<int64_t> _bytes_read_dn_cache {0};
-
- // Total number of bytes from remote reads that were expected to be local.
- std::atomic<int64_t> _unexpected_remote_bytes {0};
-
- // The number of buffers that have been returned to the reader (via get_next) that the
- // reader has not returned. Only included for debugging and diagnostics.
- std::atomic<int> _num_buffers_in_reader {0};
-
- // The number of scan ranges that have been completed for this reader.
- std::atomic<int> _num_finished_ranges {0};
-
- // The number of scan ranges that required a remote read, updated at the end of each
- // range scan. Only used for diagnostics.
- std::atomic<int> _num_remote_ranges {0};
-
- // The total number of scan ranges that have not been started. Only used for
- // diagnostics. This is the sum of all unstarted_scan_ranges across all disks.
- std::atomic<int> _num_unstarted_scan_ranges {0};
-
- // The number of buffers that are being used for this reader. This is the sum
- // of all buffers in ScanRange queues and buffers currently being read into (i.e. about
- // to be queued).
- std::atomic<int> _num_used_buffers {0};
-
- // The total number of ready buffers across all ranges. Ready buffers are buffers
- // that have been read from disk but not retrieved by the caller.
- // This is the sum of all queued buffers in all ranges for this reader context.
- std::atomic<int> _num_ready_buffers {0};
-
- // The total (sum) of queue capacities for finished scan ranges. This value
- // divided by _num_finished_ranges is the average for finished ranges and
- // used to seed the starting queue capacity for future ranges. The assumption
- // is that if previous ranges were fast, new ones will be fast too. The scan
- // range adjusts the queue capacity dynamically so a rough approximation will do.
- std::atomic<int> _total_range_queue_capacity {0};
-
- // The initial queue size for new scan ranges. This is always
- // _total_range_queue_capacity / _num_finished_ranges but stored as a separate
- // variable to allow reading this value without taking a lock. Doing the division
- // at read time (with no lock) could lead to a race where only
- // _total_range_queue_capacity or _num_finished_ranges was updated.
- int _initial_queue_capacity;
-
- // All fields below are accessed by multiple threads and the lock needs to be
- // taken before accessing them.
- std::mutex _lock;
-
- // Current state of the reader
- State _state;
-
- // Status of this reader. Set to non-ok if cancelled.
- Status _status;
-
- // The number of disks with scan ranges remaining (always equal to the sum of
- // disks with ranges).
- int _num_disks_with_ranges;
-
- // This is the list of ranges that are expected to be cached on the DN.
- // When the reader asks for a new range (GetNextScanRange()), we first
- // return ranges from this list.
- InternalQueue<ScanRange> _cached_ranges;
-
- // A list of ranges that should be returned in subsequent calls to
- // GetNextRange.
- // There is a trade-off with when to populate this list. Populating it on
- // demand means consumers need to wait (happens in DiskIoMgr::GetNextRange()).
- // Populating it preemptively means we make worse scheduling decisions.
- // We currently populate one range per disk.
- // TODO: think about this some more.
- InternalQueue<ScanRange> _ready_to_start_ranges;
- std::condition_variable _ready_to_start_ranges_cv; // used with _lock
-
- // Ranges that are blocked due to back pressure on outgoing buffers.
- InternalQueue<ScanRange> _blocked_ranges;
-
- // Condition variable for UnregisterContext() to wait for all disks to complete
- std::condition_variable _disks_complete_cond_var;
-
- // Struct containing state per disk. See comments in the disk read loop on how
- // they are used.
- class PerDiskState {
- public:
- bool done() const { return _done; }
- void set_done(bool b) { _done = b; }
-
- int num_remaining_ranges() const { return _num_remaining_ranges; }
- int& num_remaining_ranges() { return _num_remaining_ranges; }
-
- ScanRange* next_scan_range_to_start() { return _next_scan_range_to_start; }
- void set_next_scan_range_to_start(ScanRange* range) { _next_scan_range_to_start = range; }
-
- // We need to have a memory barrier to prevent this load from being reordered
- // with num_threads_in_op(), since these variables are set without the reader
- // lock taken
- bool is_on_queue() const {
- bool b = _is_on_queue;
- __sync_synchronize();
- return b;
- }
-
- int num_threads_in_op() const {
- int v = _num_threads_in_op;
- __sync_synchronize();
- return v;
- }
-
- const InternalQueue<ScanRange>* unstarted_scan_ranges() const {
- return &_unstarted_scan_ranges;
- }
- const InternalQueue<WriteRange>* unstarted_write_ranges() const {
- return &_unstarted_write_ranges;
- }
- const InternalQueue<RequestRange>* in_flight_ranges() const { return &_in_flight_ranges; }
-
- InternalQueue<ScanRange>* unstarted_scan_ranges() { return &_unstarted_scan_ranges; }
- InternalQueue<WriteRange>* unstarted_write_ranges() { return &_unstarted_write_ranges; }
- InternalQueue<RequestRange>* in_flight_ranges() { return &_in_flight_ranges; }
-
- PerDiskState() { reset(); }
-
- // Schedules the request context on this disk if it's not already on the queue.
- // Context lock must be taken before this.
- void schedule_context(RequestContext* context, int disk_id) {
- if (!_is_on_queue && !_done) {
- _is_on_queue = true;
- context->_parent->_disk_queues[disk_id]->enqueue_context(context);
- }
- }
-
- // Increment the ref count on reader. We need to track the number of threads per
- // reader per disk that are in the unlocked hdfs read code section. This is updated
- // by multiple threads without a lock so we need to use an atomic int.
- void increment_request_thread_and_dequeue() {
- ++_num_threads_in_op;
- _is_on_queue = false;
- }
-
- void decrement_request_thread() { --_num_threads_in_op; }
-
- // Decrement request thread count and do final cleanup if this is the last
- // thread. RequestContext lock must be taken before this.
- void decrement_request_thread_and_check_done(RequestContext* context) {
- --_num_threads_in_op;
- // We don't need to worry about reordered loads here because updating
- // _num_threads_in_op uses an atomic, which is a barrier.
- if (!_is_on_queue && _num_threads_in_op == 0 && !_done) {
- // This thread is the last one for this reader on this disk, do final cleanup
- context->decrement_disk_ref_count();
- _done = true;
- }
- }
-
- void reset() {
- DCHECK(_in_flight_ranges.empty());
- DCHECK(_unstarted_scan_ranges.empty());
- DCHECK(_unstarted_write_ranges.empty());
-
- _done = true;
- _num_remaining_ranges = 0;
- _is_on_queue = false;
- _num_threads_in_op = 0;
- _next_scan_range_to_start = nullptr;
- }
-
- private:
- // If true, this disk is all done for this request context, including any cleanup.
- // If done is true, it means that this request must not be on this disk's queue
- // *AND* there are no threads currently working on this context. To satisfy
- // this, only the last thread (per disk) can set this to true.
- bool _done;
-
- // For each disk, keeps track if the context is on this disk's queue, indicating
- // the disk must do some work for this context. The disk needs to do work in 4 cases:
- // 1) in_flight_ranges is not empty, the disk needs to read for this reader.
- // 2) next_range_to_start is nullptr, the disk needs to prepare a scan range to be
- // read next.
- // 3) the reader has been cancelled and this disk needs to participate in the
- // cleanup.
- // 4) A write range is added to queue.
- // In general, we only want to put a context on the disk queue if there is something
- // useful that can be done. If there's nothing useful, the disk queue will wake up
- // and then remove the reader from the queue. Doing this causes thrashing of the
- // threads.
- bool _is_on_queue;
-
- // For each disk, the number of request ranges that have not been fully read.
- // In the non-cancellation path, this will hit 0, and done will be set to true
- // by the disk thread. This is undefined in the cancellation path (the various
- // threads notice by looking at the RequestContext's _state).
- int _num_remaining_ranges;
-
- // Queue of ranges that have not started being read. This list is exclusive
- // with in_flight_ranges.
- InternalQueue<ScanRange> _unstarted_scan_ranges;
-
- // Queue of pending IO requests for this disk in the order that they will be
- // processed. A ScanRange is added to this queue when it is returned in
- // GetNextRange(), or when it is added with schedule_immediately = true.
- // A WriteRange is added to this queue from _unstarted_write_ranges for each
- // invocation of get_next_request_range() in WorkLoop().
- // The size of this queue is always less than or equal to num_remaining_ranges.
- InternalQueue<RequestRange> _in_flight_ranges;
-
- // The next range to start for this reader on this disk. Each disk (for each reader)
- // picks the next range to start. The range is set here and also added to the
- // _ready_to_start_ranges queue. The reader pulls from the queue in FIFO order,
- // so the ranges from different disks are round-robined. When the range is pulled
- // off the _ready_to_start_ranges queue, it sets this variable to nullptr, so the disk
- // knows to populate it again and add it to _ready_to_start_ranges i.e. it is used
- // as a flag by DiskIoMgr::GetNextScanRange to determine if it needs to add another
- // range to _ready_to_start_ranges.
- ScanRange* _next_scan_range_to_start;
-
- // For each disk, the number of threads issuing the underlying read/write on behalf
- // of this context. There are a few places where we release the context lock, do some
- // work, and then grab the lock again. Because we don't hold the lock for the
- // entire operation, we need this ref count to keep track of which thread should do
- // final resource cleanup during cancellation.
- // Only the thread that sees the count at 0 should do the final cleanup.
- std::atomic<int> _num_threads_in_op {0};
-
- // Queue of write ranges to process for this disk. A write range is always added
- // to _in_flight_ranges in get_next_request_range(). There are separate
- // _unstarted_scan_ranges and _unstarted_write_ranges queues to alternate between reads
- // and writes. (Otherwise, since next_scan_range_to_start is set
- // in get_next_request_range() whenever it is null, repeated calls to
- // get_next_request_range() and GetNextRange() may result in only reads being processed)
- InternalQueue<WriteRange> _unstarted_write_ranges;
- };
-
- // Per disk states to synchronize multiple disk threads accessing the same request
- // context.
- std::vector<PerDiskState> _disk_states;
-};
-
-} // namespace doris
diff --git a/be/src/runtime/disk_io_mgr_reader_context.cc b/be/src/runtime/disk_io_mgr_reader_context.cc
deleted file mode 100644
index 4ef71f1c94..0000000000
--- a/be/src/runtime/disk_io_mgr_reader_context.cc
+++ /dev/null
@@ -1,322 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-// This file is copied from
-// https://github.com/apache/impala/blob/branch-2.9.0/be/src/runtime/disk-io-mgr-reader-context.cc
-// and modified by Doris
-
-#include "runtime/disk_io_mgr_internal.h"
-
-namespace doris {
-
-using std::string;
-using std::stringstream;
-using std::vector;
-using std::list;
-using std::endl;
-
-using std::lock_guard;
-using std::unique_lock;
-using std::mutex;
-
-void DiskIoMgr::RequestContext::cancel(const Status& status) {
- DCHECK(!status.ok());
-
- // Callbacks are collected in this vector and invoked while no lock is held.
- vector<WriteRange::WriteDoneCallback> write_callbacks;
- {
- lock_guard<mutex> lock(_lock);
- DCHECK(validate()) << endl << debug_string();
-
- // Already being cancelled
- if (_state == RequestContext::Cancelled) {
- return;
- }
-
- DCHECK(_status.ok());
- _status = status;
-
- // The reader will be put into a cancelled state until call cleanup is complete.
- _state = RequestContext::Cancelled;
-
- // Cancel all scan ranges for this reader. Each range could be one one of
- // four queues.
- for (int i = 0; i < _disk_states.size(); ++i) {
- RequestContext::PerDiskState& state = _disk_states[i];
- RequestRange* range = nullptr;
- while ((range = state.in_flight_ranges()->dequeue()) != nullptr) {
- if (range->request_type() == RequestType::READ) {
- static_cast<ScanRange*>(range)->cancel(status);
- } else {
- DCHECK(range->request_type() == RequestType::WRITE);
- write_callbacks.push_back(static_cast<WriteRange*>(range)->_callback);
- }
- }
-
- ScanRange* scan_range = nullptr;
- while ((scan_range = state.unstarted_scan_ranges()->dequeue()) != nullptr) {
- scan_range->cancel(status);
- }
- WriteRange* write_range = nullptr;
- while ((write_range = state.unstarted_write_ranges()->dequeue()) != nullptr) {
- write_callbacks.push_back(write_range->_callback);
- }
- }
-
- ScanRange* range = nullptr;
- while ((range = _ready_to_start_ranges.dequeue()) != nullptr) {
- range->cancel(status);
- }
- while ((range = _blocked_ranges.dequeue()) != nullptr) {
- range->cancel(status);
- }
- while ((range = _cached_ranges.dequeue()) != nullptr) {
- range->cancel(status);
- }
-
- // Schedule reader on all disks. The disks will notice it is cancelled and do any
- // required cleanup
- for (int i = 0; i < _disk_states.size(); ++i) {
- RequestContext::PerDiskState& state = _disk_states[i];
- state.schedule_context(this, i);
- }
- }
-
- for (const WriteRange::WriteDoneCallback& write_callback : write_callbacks) {
- write_callback(_status);
- }
-
- // Signal reader and unblock the get_next/Read thread. That read will fail with
- // a cancelled status.
- _ready_to_start_ranges_cv.notify_all();
-}
-
-void DiskIoMgr::RequestContext::add_request_range(DiskIoMgr::RequestRange* range,
- bool schedule_immediately) {
- // DCHECK(_lock.is_locked()); // TODO: boost should have this API
- RequestContext::PerDiskState& state = _disk_states[range->disk_id()];
- if (state.done()) {
- DCHECK_EQ(state.num_remaining_ranges(), 0);
- state.set_done(false);
- ++_num_disks_with_ranges;
- }
-
- bool schedule_context = false;
- if (range->request_type() == RequestType::READ) {
- DiskIoMgr::ScanRange* scan_range = static_cast<DiskIoMgr::ScanRange*>(range);
- if (schedule_immediately) {
- schedule_scan_range(scan_range);
- } else {
- state.unstarted_scan_ranges()->enqueue(scan_range);
- ++_num_unstarted_scan_ranges;
- }
- // If next_scan_range_to_start is nullptr, schedule this RequestContext so that it will
- // be set. If it's not nullptr, this context will be scheduled when GetNextRange() is
- // invoked.
- schedule_context = state.next_scan_range_to_start() == nullptr;
- } else {
- DCHECK(range->request_type() == RequestType::WRITE);
- DCHECK(!schedule_immediately);
- DiskIoMgr::WriteRange* write_range = static_cast<DiskIoMgr::WriteRange*>(range);
- state.unstarted_write_ranges()->enqueue(write_range);
-
- // schedule_context() has no effect if the context is already scheduled,
- // so this is safe.
- schedule_context = true;
- }
-
- if (schedule_context) {
- state.schedule_context(this, range->disk_id());
- }
- ++state.num_remaining_ranges();
-}
-
-DiskIoMgr::RequestContext::RequestContext(DiskIoMgr* parent, int num_disks)
- : _parent(parent),
- _bytes_read_counter(nullptr),
- _read_timer(nullptr),
- _active_read_thread_counter(nullptr),
- _disks_accessed_bitmap(nullptr),
- _state(Inactive),
- _disk_states(num_disks) {}
-
-// Resets this object.
-void DiskIoMgr::RequestContext::reset() {
- DCHECK_EQ(_state, Inactive);
- _status = Status::OK();
-
- _bytes_read_counter = nullptr;
- _read_timer = nullptr;
- _active_read_thread_counter = nullptr;
- _disks_accessed_bitmap = nullptr;
-
- _state = Active;
-
- _num_unstarted_scan_ranges = 0;
- _num_disks_with_ranges = 0;
- _num_used_buffers = 0;
- _num_buffers_in_reader = 0;
- _num_ready_buffers = 0;
- _total_range_queue_capacity = 0;
- _num_finished_ranges = 0;
- _num_remote_ranges = 0;
- _bytes_read_local = 0;
- _bytes_read_short_circuit = 0;
- _bytes_read_dn_cache = 0;
- _unexpected_remote_bytes = 0;
- _initial_queue_capacity = DiskIoMgr::DEFAULT_QUEUE_CAPACITY;
-
- DCHECK(_ready_to_start_ranges.empty());
- DCHECK(_blocked_ranges.empty());
- DCHECK(_cached_ranges.empty());
-
- for (int i = 0; i < _disk_states.size(); ++i) {
- _disk_states[i].reset();
- }
-}
-
-// Dumps out request context information. Lock should be taken by caller
-string DiskIoMgr::RequestContext::debug_string() const {
- stringstream ss;
- ss << endl << " RequestContext: " << (void*)this << " (state=";
- if (_state == RequestContext::Inactive) {
- ss << "Inactive";
- }
- if (_state == RequestContext::Cancelled) ss << "Cancelled";
- if (_state == RequestContext::Active) ss << "Active";
- if (_state != RequestContext::Inactive) {
- ss << " _status=" << _status << " #ready_buffers=" << _num_ready_buffers
- << " #used_buffers=" << _num_used_buffers
- << " #num_buffers_in_reader=" << _num_buffers_in_reader
- << " #finished_scan_ranges=" << _num_finished_ranges
- << " #disk_with_ranges=" << _num_disks_with_ranges
- << " #disks=" << _num_disks_with_ranges;
- for (int i = 0; i < _disk_states.size(); ++i) {
- ss << endl
- << " " << i << ": "
- << "is_on_queue=" << _disk_states[i].is_on_queue()
- << " done=" << _disk_states[i].done()
- << " #num_remaining_scan_ranges=" << _disk_states[i].num_remaining_ranges()
- << " #in_flight_ranges=" << _disk_states[i].in_flight_ranges()->size()
- << " #unstarted_scan_ranges=" << _disk_states[i].unstarted_scan_ranges()->size()
- << " #unstarted_write_ranges=" << _disk_states[i].unstarted_write_ranges()->size()
- << " #reading_threads=" << _disk_states[i].num_threads_in_op();
- }
- }
- ss << ")";
- return ss.str();
-}
-
-bool DiskIoMgr::RequestContext::validate() const {
- if (_state == RequestContext::Inactive) {
- LOG(WARNING) << "_state == RequestContext::Inactive";
- return false;
- }
-
- if (_num_used_buffers < 0) {
- LOG(WARNING) << "_num_used_buffers < 0: #used=" << _num_used_buffers;
- return false;
- }
-
- if (_num_ready_buffers < 0) {
- LOG(WARNING) << "_num_ready_buffers < 0: #used=" << _num_ready_buffers;
- return false;
- }
-
- int total_unstarted_ranges = 0;
- for (int i = 0; i < _disk_states.size(); ++i) {
- const PerDiskState& state = _disk_states[i];
- bool on_queue = state.is_on_queue();
- int num_reading_threads = state.num_threads_in_op();
-
- total_unstarted_ranges += state.unstarted_scan_ranges()->size();
-
- if (num_reading_threads < 0) {
- LOG(WARNING) << "disk_id=" << i
- << "state.num_threads_in_read < 0: #threads=" << num_reading_threads;
- return false;
- }
-
- if (_state != RequestContext::Cancelled) {
- if (state.unstarted_scan_ranges()->size() + state.in_flight_ranges()->size() >
- state.num_remaining_ranges()) {
- LOG(WARNING) << "disk_id=" << i
- << " state.unstarted_ranges.size() + state.in_flight_ranges.size()"
- << " > state.num_remaining_ranges:"
- << " #unscheduled=" << state.unstarted_scan_ranges()->size()
- << " #in_flight=" << state.in_flight_ranges()->size()
- << " #remaining=" << state.num_remaining_ranges();
- return false;
- }
-
- // If we have an in_flight range, the reader must be on the queue or have a
- // thread actively reading for it.
- if (!state.in_flight_ranges()->empty() && !on_queue && num_reading_threads == 0) {
- LOG(WARNING) << "disk_id=" << i
- << " reader has inflight ranges but is not on the disk queue."
- << " #in_flight_ranges=" << state.in_flight_ranges()->size()
- << " #reading_threads=" << num_reading_threads
- << " on_queue=" << on_queue;
- return false;
- }
-
- if (state.done() && num_reading_threads > 0) {
- LOG(WARNING) << "disk_id=" << i
- << " state set to done but there are still threads working."
- << " #reading_threads=" << num_reading_threads;
- return false;
- }
- } else {
- // Is Cancelled
- if (!state.in_flight_ranges()->empty()) {
- LOG(WARNING) << "disk_id=" << i << "Reader cancelled but has in flight ranges.";
- return false;
- }
- if (!state.unstarted_scan_ranges()->empty()) {
- LOG(WARNING) << "disk_id=" << i << "Reader cancelled but has unstarted ranges.";
- return false;
- }
- }
-
- if (state.done() && on_queue) {
- LOG(WARNING) << "disk_id=" << i
- << " state set to done but the reader is still on the disk queue."
- << " state.done=true and state.is_on_queue=true";
- return false;
- }
- }
-
- if (_state != RequestContext::Cancelled) {
- if (total_unstarted_ranges != _num_unstarted_scan_ranges) {
- LOG(WARNING) << "total_unstarted_ranges=" << total_unstarted_ranges
- << " sum_in_states=" << _num_unstarted_scan_ranges;
- return false;
- }
- } else {
- if (!_ready_to_start_ranges.empty()) {
- LOG(WARNING) << "Reader cancelled but has ready to start ranges.";
- return false;
- }
- if (!_blocked_ranges.empty()) {
- LOG(WARNING) << "Reader cancelled but has blocked ranges.";
- return false;
- }
- }
-
- return true;
-}
-
-} // namespace doris
diff --git a/be/src/runtime/disk_io_mgr_scan_range.cc b/be/src/runtime/disk_io_mgr_scan_range.cc
deleted file mode 100644
index 82962f4b3d..0000000000
--- a/be/src/runtime/disk_io_mgr_scan_range.cc
+++ /dev/null
@@ -1,481 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-// This file is copied from
-// https://github.com/apache/impala/blob/branch-2.9.0/be/src/runtime/disk-io-mgr-scan-range.cc
-// and modified by Doris
-
-#include "runtime/disk_io_mgr.h"
-#include "runtime/disk_io_mgr_internal.h"
-#include "util/error_util.h"
-
-using std::string;
-using std::stringstream;
-using std::vector;
-using std::list;
-using std::endl;
-
-using std::lock_guard;
-using std::unique_lock;
-using std::mutex;
-
-namespace doris {
-
-// A very large max value to prevent things from going out of control. Not
-// expected to ever hit this value (1GB of buffered data per range).
-const int MAX_QUEUE_CAPACITY = 128;
-const int MIN_QUEUE_CAPACITY = 2;
-
-// Implementation of the ScanRange functionality. Each ScanRange contains a queue
-// of ready buffers. For each ScanRange, there is only a single producer and
-// consumer thread, i.e. only one disk thread will push to a scan range at
-// any time and only one thread will remove from the queue. This is to guarantee
-// that buffers are queued and read in file order.
-
-// This must be called with the reader lock taken.
-bool DiskIoMgr::ScanRange::enqueue_buffer(BufferDescriptor* buffer) {
- {
- unique_lock<mutex> scan_range_lock(_lock);
- DCHECK(validate()) << debug_string();
- DCHECK(!_eosr_returned);
- DCHECK(!_eosr_queued);
- if (_is_cancelled) {
- // Return the buffer, this range has been cancelled
- if (buffer->_buffer != nullptr) {
- ++_io_mgr->_num_buffers_in_readers;
- ++_reader->_num_buffers_in_reader;
- }
- --_reader->_num_used_buffers;
- buffer->return_buffer();
- return false;
- }
- ++_reader->_num_ready_buffers;
- _ready_buffers.push_back(buffer);
- _eosr_queued = buffer->eosr();
-
- _blocked_on_queue = _ready_buffers.size() >= _ready_buffers_capacity;
- if (_blocked_on_queue && _ready_buffers_capacity > MIN_QUEUE_CAPACITY) {
- // We have filled the queue, indicating we need back pressure on
- // the producer side (i.e. we are pushing buffers faster than they
- // are pulled off, throttle this range more).
- --_ready_buffers_capacity;
- }
- }
-
- _buffer_ready_cv.notify_one();
-
- return _blocked_on_queue;
-}
-
-Status DiskIoMgr::ScanRange::get_next(BufferDescriptor** buffer) {
- *buffer = nullptr;
-
- {
- unique_lock<mutex> scan_range_lock(_lock);
- if (_eosr_returned) {
- return Status::OK();
- }
- DCHECK(validate()) << debug_string();
-
- if (_ready_buffers.empty()) {
- // The queue is empty indicating this thread could use more
- // IO. Increase the capacity to allow for more queueing.
- ++_ready_buffers_capacity;
- _ready_buffers_capacity = std::min(_ready_buffers_capacity, MAX_QUEUE_CAPACITY);
- }
-
- while (_ready_buffers.empty() && !_is_cancelled) {
- _buffer_ready_cv.wait(scan_range_lock);
- }
-
- if (_is_cancelled) {
- DCHECK(!_status.ok());
- return _status;
- }
-
- // Remove the first ready buffer from the queue and return it
- DCHECK(!_ready_buffers.empty());
- *buffer = _ready_buffers.front();
- _ready_buffers.pop_front();
- _eosr_returned = (*buffer)->eosr();
- }
-
- // Update tracking counters. The buffer has now moved from the IoMgr to the
- // caller.
- ++_io_mgr->_num_buffers_in_readers;
- ++_reader->_num_buffers_in_reader;
- --_reader->_num_ready_buffers;
- --_reader->_num_used_buffers;
-
- Status status = (*buffer)->_status;
- if (!status.ok()) {
- (*buffer)->return_buffer();
- *buffer = nullptr;
- return status;
- }
-
- unique_lock<mutex> reader_lock(_reader->_lock);
- if (_eosr_returned) {
- _reader->_total_range_queue_capacity += _ready_buffers_capacity;
- ++_reader->_num_finished_ranges;
- _reader->_initial_queue_capacity =
- _reader->_total_range_queue_capacity / _reader->_num_finished_ranges;
- }
-
- DCHECK(_reader->validate()) << endl << _reader->debug_string();
- if (_reader->_state == RequestContext::Cancelled) {
- _reader->_blocked_ranges.remove(this);
- cancel(_reader->_status);
- (*buffer)->return_buffer();
- *buffer = nullptr;
- return _status;
- }
-
- bool was_blocked = _blocked_on_queue;
- _blocked_on_queue = _ready_buffers.size() >= _ready_buffers_capacity;
- if (was_blocked && !_blocked_on_queue && !_eosr_queued) {
- // This scan range was blocked and is no longer, add it to the reader
- // queue again.
- _reader->_blocked_ranges.remove(this);
- _reader->schedule_scan_range(this);
- }
- return Status::OK();
-}
-
-void DiskIoMgr::ScanRange::cancel(const Status& status) {
- // Cancelling a range that was never started, ignore.
- if (_io_mgr == nullptr) {
- return;
- }
-
- DCHECK(!status.ok());
- {
- // Grab both locks to make sure that all working threads see _is_cancelled.
- unique_lock<mutex> scan_range_lock(_lock);
- unique_lock<mutex> hdfs_lock(_hdfs_lock);
- DCHECK(validate()) << debug_string();
- if (_is_cancelled) {
- return;
- }
- _is_cancelled = true;
- _status = status;
- }
- _buffer_ready_cv.notify_all();
- cleanup_queued_buffers();
-
- // For cached buffers, we can't close the range until the cached buffer is returned.
- // close() is called from DiskIoMgr::return_buffer().
- if (_cached_buffer == nullptr) {
- close();
- }
-}
-
-void DiskIoMgr::ScanRange::cleanup_queued_buffers() {
- DCHECK(_is_cancelled);
- _io_mgr->_num_buffers_in_readers += _ready_buffers.size();
- _reader->_num_buffers_in_reader += _ready_buffers.size();
- _reader->_num_used_buffers -= _ready_buffers.size();
- _reader->_num_ready_buffers -= _ready_buffers.size();
-
- while (!_ready_buffers.empty()) {
- BufferDescriptor* buffer = _ready_buffers.front();
- buffer->return_buffer();
- _ready_buffers.pop_front();
- }
-}
-
-string DiskIoMgr::ScanRange::debug_string() const {
- stringstream ss;
- ss << "file=" << _file << " disk_id=" << _disk_id << " offset=" << _offset << " len=" << _len
- << " bytes_read=" << _bytes_read << " buffer_queue=" << _ready_buffers.size()
- << " capacity=" << _ready_buffers_capacity << " hdfs_file=" << _hdfs_file;
- return ss.str();
-}
-
-bool DiskIoMgr::ScanRange::validate() {
- if (_bytes_read > _len) {
- LOG(WARNING) << "Bytes read tracking is wrong. Shouldn't read past the scan range."
- << " _bytes_read=" << _bytes_read << " _len=" << _len;
- return false;
- }
- if (_eosr_returned && !_eosr_queued) {
- LOG(WARNING) << "Returned eosr to reader before finishing reading the scan range"
- << " _eosr_returned=" << _eosr_returned << " _eosr_queued=" << _eosr_queued;
- return false;
- }
- return true;
-}
-
-DiskIoMgr::ScanRange::ScanRange(int capacity) : _ready_buffers_capacity(capacity) {
- _request_type = RequestType::READ;
- reset(nullptr, "", -1, -1, -1, false, false, NEVER_CACHE);
-}
-
-DiskIoMgr::ScanRange::~ScanRange() {
- DCHECK(_hdfs_file == nullptr) << "File was not closed.";
- DCHECK(_cached_buffer == nullptr) << "Cached buffer was not released.";
-}
-
-void DiskIoMgr::ScanRange::reset(hdfsFS fs, const char* file, int64_t len, int64_t offset,
- int disk_id, bool try_cache, bool expected_local, int64_t mtime,
- void* meta_data) {
- DCHECK(_ready_buffers.empty());
- _fs = fs;
- _file = file;
- _len = len;
- _offset = offset;
- _disk_id = disk_id;
- _try_cache = try_cache;
- _expected_local = expected_local;
- _meta_data = meta_data;
- _cached_buffer = nullptr;
- _io_mgr = nullptr;
- _reader = nullptr;
- _hdfs_file = nullptr;
- _mtime = mtime;
-}
-
-void DiskIoMgr::ScanRange::init_internal(DiskIoMgr* io_mgr, RequestContext* reader) {
- DCHECK(_hdfs_file == nullptr);
- _io_mgr = io_mgr;
- _reader = reader;
- _local_file = nullptr;
- _hdfs_file = nullptr;
- _bytes_read = 0;
- _is_cancelled = false;
- _eosr_queued = false;
- _eosr_returned = false;
- _blocked_on_queue = false;
- if (_ready_buffers_capacity <= 0) {
- _ready_buffers_capacity = reader->initial_scan_range_queue_capacity();
- DCHECK_GE(_ready_buffers_capacity, MIN_QUEUE_CAPACITY);
- }
- DCHECK(validate()) << debug_string();
-}
-
-Status DiskIoMgr::ScanRange::open() {
- unique_lock<mutex> hdfs_lock(_hdfs_lock);
- if (_is_cancelled) {
- return Status::Cancelled("Cancelled");
- }
-
- // if (_fs != nullptr) {
- // if (_hdfs_file != nullptr) {
- // return Status::OK();
- // }
- // _hdfs_file = _io_mgr->OpenHdfsFile(_fs, file(), mtime());
- // if (_hdfs_file == nullptr) {
- // return Status::InternalError("GetHdfsErrorMsg("Failed to open HDFS file ", _file));
- // }
-
- // if (hdfsSeek(_fs, _hdfs_file->file(), _offset) != 0) {
- // _io_mgr->cache_or_close_file_handle(file(), _hdfs_file, false);
- // _hdfs_file = nullptr;
- // string error_msg = GetHdfsErrorMsg("");
- // stringstream ss;
- // ss << "Error seeking to " << _offset << " in file: " << _file << " " << error_msg;
- // return Status::InternalError(ss.str());
- // }
- // } else {
- if (_local_file != nullptr) {
- return Status::OK();
- }
-
- _local_file = fopen(file(), "r");
- if (_local_file == nullptr) {
- string error_msg = get_str_err_msg();
- return Status::InternalError("Could not open file: {}: {}", _file, error_msg);
- }
- if (fseek(_local_file, _offset, SEEK_SET) == -1) {
- fclose(_local_file);
- _local_file = nullptr;
- string error_msg = get_str_err_msg();
- return Status::InternalError("Could not seek to {} for file: {}: {}", _offset, _file,
- error_msg);
- }
- // }
- return Status::OK();
-}
-
-void DiskIoMgr::ScanRange::close() {
- unique_lock<mutex> hdfs_lock(_hdfs_lock);
- /*
- * if (_fs != nullptr) {
- * if (_hdfs_file == nullptr) return;
- *
- * struct hdfsReadStatistics* stats;
- * if (IsDfsPath(file())) {
- * int success = hdfsFileGetReadStatistics(_hdfs_file->file(), &stats);
- * if (success == 0) {
- * _reader->_bytes_read_local += stats->totalLocalBytesRead;
- * _reader->_bytes_read_short_circuit += stats->totalShortCircuitBytesRead;
- * _reader->_bytes_read_dn_cache += stats->totalZeroCopyBytesRead;
- * if (stats->totalLocalBytesRead != stats->totalBytesRead) {
- * ++_reader->_num_remote_ranges;
- * if (_expected_local) {
- * int remote_bytes = stats->totalBytesRead - stats->totalLocalBytesRead;
- * _reader->_unexpected_remote_bytes += remote_bytes;
- * VLOG_FILE << "Unexpected remote HDFS read of "
- * << PrettyPrinter::Print(remote_bytes, TUnit::BYTES)
- * << " for file '" << _file << "'";
- * }
- * }
- * hdfsFileFreeReadStatistics(stats);
- * }
- * }
- * if (_cached_buffer != nullptr) {
- * hadoopRzBufferFree(_hdfs_file->file(), _cached_buffer);
- * _cached_buffer = nullptr;
- * }
- * _io_mgr->cache_or_close_file_handle(file(), _hdfs_file, false);
- * VLOG_FILE << "Cache HDFS file handle file=" << file();
- * _hdfs_file = nullptr;
- * } else {
- */
- {
- if (_local_file == nullptr) {
- return;
- }
- fclose(_local_file);
- _local_file = nullptr;
- }
-}
-
-/*
- * int64_t DiskIoMgr::ScanRange::max_read_chunk_size() const {
- * // S3 InputStreams don't support DIRECT_READ (i.e. java.nio.ByteBuffer read()
- * // interface). So, hdfsRead() needs to allocate a Java byte[] and copy the data out.
- * // Profiles show that both the JNI array allocation and the memcpy adds much more
- * // overhead for larger buffers, so limit the size of each read request. 128K was
- * // chosen empirically by trying values between 4K and 8M and optimizing for lower CPU
- * // utilization and higher S3 throughput.
- * if (_disk_id == _io_mgr->RemoteS3DiskId()) {
- * DCHECK(IsS3APath(file()));
- * return 128 * 1024;
- * }
- * return numeric_limits<int64_t>::max();
- * }
- */
-
-// TODO: how do we best use the disk here. e.g. is it good to break up a
-// 1MB read into 8 128K reads?
-// TODO: look at linux disk scheduling
-Status DiskIoMgr::ScanRange::read(char* buffer, int64_t* bytes_read, bool* eosr) {
- unique_lock<mutex> hdfs_lock(_hdfs_lock);
- if (_is_cancelled) {
- return Status::Cancelled("Cancelled");
- }
-
- *eosr = false;
- *bytes_read = 0;
- // hdfsRead() length argument is an int. Since _max_buffer_size type is no bigger
- // than an int, this min() will ensure that we don't overflow the length argument.
- DCHECK_LE(sizeof(_io_mgr->_max_buffer_size), sizeof(int));
- int bytes_to_read =
- std::min(static_cast<int64_t>(_io_mgr->_max_buffer_size), _len - _bytes_read);
- DCHECK_GE(bytes_to_read, 0);
-
- /*
- * if (_fs != nullptr) {
- * DCHECK(_hdfs_file != nullptr);
- * int64_t max_chunk_size = max_read_chunk_size();
- * while (*bytes_read < bytes_to_read) {
- * int chunk_size = min(bytes_to_read - *bytes_read, max_chunk_size);
- * int last_read = hdfsRead(_fs, _hdfs_file->file(), buffer + *bytes_read, chunk_size);
- * if (last_read == -1) {
- * return Status::InternalError("GetHdfsErrorMsg("Error reading from HDFS file: ", _file));
- * } else if (last_read == 0) {
- * // No more bytes in the file. The scan range went past the end.
- * *eosr = true;
- * break;
- * }
- * *bytes_read += last_read;
- * }
- * } else {
- */
- DCHECK(_local_file != nullptr);
- *bytes_read = fread(buffer, 1, bytes_to_read, _local_file);
- DCHECK_GE(*bytes_read, 0);
- DCHECK_LE(*bytes_read, bytes_to_read);
- if (*bytes_read < bytes_to_read) {
- if (ferror(_local_file) != 0) {
- string error_msg = get_str_err_msg();
- return Status::InternalError("Error reading from {} at byte offset: {}: {}", _file,
- (_offset + _bytes_read), error_msg);
- } else {
- // On Linux, we should only get partial reads from block devices on error or eof.
- DCHECK(feof(_local_file) != 0);
- *eosr = true;
- }
- }
- // }
- _bytes_read += *bytes_read;
- DCHECK_LE(_bytes_read, _len);
- if (_bytes_read == _len) {
- *eosr = true;
- }
- return Status::OK();
-}
-
-/*
- * Status DiskIoMgr::ScanRange::read_from_cache(bool* read_succeeded) {
- * DCHECK(_try_cache);
- * DCHECK_EQ(_bytes_read, 0);
- * *read_succeeded = false;
- * Status status = open();
- * if (!status.ok()) return status;
- *
- * // Cached reads not supported on local filesystem.
- * if (_fs == nullptr) return Status::OK();
- *
- * {
- * unique_lock<mutex> hdfs_lock(_hdfs_lock);
- * if (_is_cancelled) return Status::Cancelled("Cancelled");
- *
- * DCHECK(_hdfs_file != nullptr);
- * DCHECK(_cached_buffer == nullptr);
- * _cached_buffer = hadoopReadZero(_hdfs_file->file(),
- * _io_mgr->_cached_read_options, len());
- *
- * // Data was not cached, caller will fall back to normal read path.
- * if (_cached_buffer == nullptr) return Status::OK();
- * }
- *
- * // Cached read succeeded.
- * void* buffer = const_cast<void*>(hadoopRzBufferGet(_cached_buffer));
- * int32_t bytes_read = hadoopRzBufferLength(_cached_buffer);
- * // For now, entire the entire block is cached or none of it.
- * // TODO: if HDFS ever changes this, we'll have to handle the case where half
- * // the block is cached.
- * DCHECK_EQ(bytes_read, len());
- *
- * // Create a single buffer desc for the entire scan range and enqueue that.
- * BufferDescriptor* desc = _io_mgr->get_buffer_desc(
- * _reader, this, reinterpret_cast<char*>(buffer), 0);
- * desc->_len = bytes_read;
- * desc->_scan_range_offset = 0;
- * desc->_eosr = true;
- * _bytes_read = bytes_read;
- * enqueue_buffer(desc);
- * if (_reader->_bytes_read_counter != nullptr) {
- * COUNTER_ADD(_reader->_bytes_read_counter, bytes_read);
- * }
- * *read_succeeded = true;
- * ++_reader->_num_used_buffers;
- * return Status::OK();
- * }
- */
-} // namespace doris
diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h
index a4ebf52be6..c47d15d9fa 100644
--- a/be/src/runtime/exec_env.h
+++ b/be/src/runtime/exec_env.h
@@ -40,11 +40,8 @@ class BrokerMgr;
template <class T>
class BrpcClientCache;
-class BufferPool;
class CgroupsMgr;
class DataStreamMgr;
-class DiskIoMgr;
-class EtlJobMgr;
class EvHttpServer;
class ExternalScanContextMgr;
class FragmentMgr;
@@ -153,7 +150,6 @@ public:
ResultCache* result_cache() { return _result_cache; }
TMasterInfo* master_info() { return _master_info; }
LoadPathMgr* load_path_mgr() { return _load_path_mgr; }
- DiskIoMgr* disk_io_mgr() { return _disk_io_mgr; }
TmpFileMgr* tmp_file_mgr() { return _tmp_file_mgr; }
BfdParser* bfd_parser() const { return _bfd_parser; }
BrokerMgr* broker_mgr() const { return _broker_mgr; }
@@ -163,7 +159,6 @@ public:
BrpcClientCache<PFunctionService_Stub>* brpc_function_client_cache() const {
return _function_client_cache;
}
- BufferPool* buffer_pool() { return _buffer_pool; }
LoadChannelMgr* load_channel_mgr() { return _load_channel_mgr; }
LoadStreamMgr* load_stream_mgr() { return _load_stream_mgr; }
NewLoadStreamMgr* new_load_stream_mgr() { return _new_load_stream_mgr; }
@@ -195,8 +190,6 @@ private:
void _destroy();
Status _init_mem_env();
- /// Initialise 'buffer_pool_' with given capacity.
- void _init_buffer_pool(int64_t min_page_len, int64_t capacity, int64_t clean_pages_limit);
void _register_metrics();
void _deregister_metrics();
@@ -237,7 +230,6 @@ private:
ResultCache* _result_cache = nullptr;
TMasterInfo* _master_info = nullptr;
LoadPathMgr* _load_path_mgr = nullptr;
- DiskIoMgr* _disk_io_mgr = nullptr;
TmpFileMgr* _tmp_file_mgr = nullptr;
BfdParser* _bfd_parser = nullptr;
@@ -248,8 +240,6 @@ private:
BrpcClientCache<PBackendService_Stub>* _internal_client_cache = nullptr;
BrpcClientCache<PFunctionService_Stub>* _function_client_cache = nullptr;
- BufferPool* _buffer_pool = nullptr;
-
StorageEngine* _storage_engine = nullptr;
StreamLoadExecutor* _stream_load_executor = nullptr;
diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp
index fcc886bb25..22fa7f9123 100644
--- a/be/src/runtime/exec_env_init.cpp
+++ b/be/src/runtime/exec_env_init.cpp
@@ -28,10 +28,8 @@
#include "pipeline/task_scheduler.h"
#include "runtime/block_spill_manager.h"
#include "runtime/broker_mgr.h"
-#include "runtime/bufferpool/buffer_pool.h"
#include "runtime/cache/result_cache.h"
#include "runtime/client_cache.h"
-#include "runtime/disk_io_mgr.h"
#include "runtime/exec_env.h"
#include "runtime/external_scan_context_mgr.h"
#include "runtime/fold_constant_executor.h"
@@ -115,7 +113,6 @@ Status ExecEnv::_init(const std::vector<StorePath>& store_paths) {
config::query_cache_elasticity_size_mb);
_master_info = new TMasterInfo();
_load_path_mgr = new LoadPathMgr(this);
- _disk_io_mgr = new DiskIoMgr();
_tmp_file_mgr = new TmpFileMgr(this);
_bfd_parser = BfdParser::create();
_broker_mgr = new BrokerMgr(this);
@@ -186,48 +183,11 @@ Status ExecEnv::_init_mem_env() {
return Status::InternalError(ss.str());
}
- int64_t buffer_pool_limit = ParseUtil::parse_mem_spec(
- config::buffer_pool_limit, MemInfo::mem_limit(), MemInfo::physical_mem(), &is_percent);
- if (buffer_pool_limit <= 0) {
- ss << "Invalid config buffer_pool_limit value, must be a percentage or "
- "positive bytes value or percentage: "
- << config::buffer_pool_limit;
- return Status::InternalError(ss.str());
- }
- buffer_pool_limit = BitUtil::RoundDown(buffer_pool_limit, config::min_buffer_size);
- while (!is_percent && buffer_pool_limit > MemInfo::mem_limit() / 2) {
- // If buffer_pool_limit is not a percentage, and the value exceeds 50% of the total memory limit,
- // it is forced to be reduced to less than 50% of the total memory limit.
- // This is to ensure compatibility. In principle, buffer_pool_limit should be set as a percentage.
- buffer_pool_limit = buffer_pool_limit / 2;
- }
-
- int64_t clean_pages_limit =
- ParseUtil::parse_mem_spec(config::buffer_pool_clean_pages_limit, buffer_pool_limit,
- MemInfo::physical_mem(), &is_percent);
- if (clean_pages_limit <= 0) {
- ss << "Invalid buffer_pool_clean_pages_limit value, must be a percentage or "
- "positive bytes value or percentage: "
- << config::buffer_pool_clean_pages_limit;
- return Status::InternalError(ss.str());
- }
- while (!is_percent && clean_pages_limit > buffer_pool_limit / 2) {
- // Reason same as buffer_pool_limit
- clean_pages_limit = clean_pages_limit / 2;
- }
- _init_buffer_pool(config::min_buffer_size, buffer_pool_limit, clean_pages_limit);
- LOG(INFO) << "Buffer pool memory limit: "
- << PrettyPrinter::print(buffer_pool_limit, TUnit::BYTES)
- << ", origin config value: " << config::buffer_pool_limit
- << ". clean pages limit: " << PrettyPrinter::print(clean_pages_limit, TUnit::BYTES)
- << ", origin config value: " << config::buffer_pool_clean_pages_limit;
-
// 3. init storage page cache
int64_t storage_cache_limit =
ParseUtil::parse_mem_spec(config::storage_page_cache_limit, MemInfo::mem_limit(),
MemInfo::physical_mem(), &is_percent);
while (!is_percent && storage_cache_limit > MemInfo::mem_limit() / 2) {
- // Reason same as buffer_pool_limit
storage_cache_limit = storage_cache_limit / 2;
}
int32_t index_percentage = config::index_page_cache_percentage;
@@ -254,7 +214,6 @@ Status ExecEnv::_init_mem_env() {
SegmentLoader::create_global_instance(segment_cache_capacity);
// 4. init other managers
- RETURN_IF_ERROR(_disk_io_mgr->init(MemInfo::mem_limit()));
RETURN_IF_ERROR(_tmp_file_mgr->init());
RETURN_IF_ERROR(_block_spill_mgr->init());
@@ -277,12 +236,6 @@ Status ExecEnv::_init_mem_env() {
return Status::OK();
}
-void ExecEnv::_init_buffer_pool(int64_t min_page_size, int64_t capacity,
- int64_t clean_pages_limit) {
- DCHECK(_buffer_pool == nullptr);
- _buffer_pool = new BufferPool(min_page_size, capacity, clean_pages_limit);
-}
-
void ExecEnv::init_download_cache_buf() {
std::unique_ptr<char[]> download_cache_buf(new char[config::download_cache_buffer_size]);
memset(download_cache_buf.get(), 0, config::download_cache_buffer_size);
@@ -335,7 +288,6 @@ void ExecEnv::_destroy() {
SAFE_DELETE(_broker_mgr);
SAFE_DELETE(_bfd_parser);
SAFE_DELETE(_tmp_file_mgr);
- SAFE_DELETE(_disk_io_mgr);
SAFE_DELETE(_load_path_mgr);
SAFE_DELETE(_master_info);
SAFE_DELETE(_fragment_mgr);
diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h
index 20a25b1493..ee22760ab1 100644
--- a/be/src/runtime/runtime_state.h
+++ b/be/src/runtime/runtime_state.h
@@ -43,7 +43,6 @@ class DateTimeValue;
class MemTracker;
class DataStreamRecvr;
class ResultBufferMgr;
-class DiskIoMgrs;
class TmpFileMgr;
class BufferedBlockMgr;
class BufferedBlockMgr2;
diff --git a/be/src/util/filesystem_util.h b/be/src/util/filesystem_util.h
index ee5adbcf99..ac29295e3e 100644
--- a/be/src/util/filesystem_util.h
+++ b/be/src/util/filesystem_util.h
@@ -25,8 +25,8 @@
namespace doris {
// Utility class for common local file system operations such as file creation and
-// deletion. This class should NOT be used to read or write data (DiskIoMgr is used
-// for that). Errors are indicated by the status code RUNTIME_ERROR, and are not
+// deletion. This class should NOT be used to read or write data
+// Errors are indicated by the status code RUNTIME_ERROR, and are not
// handled via exceptions.
class FileSystemUtil {
public:
diff --git a/be/test/CMakeLists.txt b/be/test/CMakeLists.txt
index 6c59e91d46..28f41c081f 100644
--- a/be/test/CMakeLists.txt
+++ b/be/test/CMakeLists.txt
@@ -159,7 +159,6 @@ set(RUNTIME_TEST_FILES
# runtime/dpp_sink_test.cpp
# runtime/data_spliter_test.cpp
# runtime/tmp_file_mgr_test.cpp
- # runtime/disk_io_mgr_test.cpp
# runtime/thread_resource_mgr_test.cpp
# runtime/export_task_mgr_test.cpp
runtime/mem_pool_test.cpp
diff --git a/be/test/runtime/disk_io_mgr_test.cpp b/be/test/runtime/disk_io_mgr_test.cpp
deleted file mode 100644
index 4b5666a240..0000000000
--- a/be/test/runtime/disk_io_mgr_test.cpp
+++ /dev/null
@@ -1,1069 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "runtime/disk_io_mgr.h"
-
-#include <gtest/gtest.h>
-#include <sched.h>
-#include <sys/stat.h>
-
-#include <functional>
-#include <thread>
-
-#include "util/cpu_info.h"
-#include "util/disk_info.h"
-#include "util/thread_group.h"
-
-using std::string;
-using std::stringstream;
-using std::vector;
-using std::list;
-
-using std::lock_guard;
-using std::unique_lock;
-using std::mutex;
-using std::mem_fn;
-using std::condition_variable;
-using std::unique_ptr;
-using std::thread;
-
-namespace doris {
-
-const int MIN_BUFFER_SIZE = 512;
-const int MAX_BUFFER_SIZE = 1024;
-const int LARGE_MEM_LIMIT = 1024 * 1024 * 1024;
-
-class DiskIoMgrTest : public testing::Test {
-public:
- void write_validate_callback(int num_writes, DiskIoMgr::WriteRange** written_range,
- DiskIoMgr* io_mgr, DiskIoMgr::RequestContext* reader,
- int32_t* data, Status expected_status, const Status& status) {
- if (expected_status.code() == E_CANCELLED) {
- EXPECT_TRUE(status.ok() || status.is<E_CANCELLED>());
- } else {
- EXPECT_TRUE(status.code() == expected_status.code());
- }
- if (status.ok()) {
- DiskIoMgr::ScanRange* scan_range = _pool->add(new DiskIoMgr::ScanRange());
- scan_range->reset(nullptr, (*written_range)->file(), (*written_range)->len(),
- (*written_range)->offset(), 0, false, false,
- DiskIoMgr::ScanRange::NEVER_CACHE);
- validate_sync_read(io_mgr, reader, scan_range, reinterpret_cast<const char*>(data),
- sizeof(int32_t));
- }
-
- {
- lock_guard<mutex> l(_written_mutex);
- ++_num_ranges_written;
- if (_num_ranges_written == num_writes) {
- _writes_done.notify_one();
- }
- }
- }
-
- void write_complete_callback(int num_writes, const Status& status) {
- EXPECT_TRUE(status.ok());
- {
- lock_guard<mutex> l(_written_mutex);
- ++_num_ranges_written;
- if (_num_ranges_written == num_writes) {
- _writes_done.notify_all();
- }
- }
- }
-
-protected:
- void CreateTempFile(const char* filename, const char* data) {
- FILE* file = fopen(filename, "w");
- EXPECT_TRUE(file != nullptr);
- fwrite(data, 1, strlen(data), file);
- fclose(file);
- }
-
- int CreateTempFile(const char* filename, int file_size) {
- FILE* file = fopen(filename, "w");
- EXPECT_TRUE(file != nullptr);
- int success = fclose(file);
- if (success != 0) {
- LOG(ERROR) << "Error closing file " << filename;
- return success;
- }
- return truncate(filename, file_size);
- }
-
- // Validates that buffer[i] is \0 or expected[i]
- static void validate_empty_or_correct(const char* expected, const char* buffer, int len) {
- for (int i = 0; i < len; ++i) {
- if (buffer[i] != '\0') {
- EXPECT_EQ(expected[i], buffer[i]) << (int)expected[i] << " != " << (int)buffer[i];
- }
- }
- }
-
- static void validate_sync_read(DiskIoMgr* io_mgr, DiskIoMgr::RequestContext* reader,
- DiskIoMgr::ScanRange* range, const char* expected,
- int expected_len = -1) {
- DiskIoMgr::BufferDescriptor* buffer;
- Status status = io_mgr->read(reader, range, &buffer);
- EXPECT_TRUE(status.ok());
- EXPECT_TRUE(buffer != nullptr);
- EXPECT_EQ(buffer->len(), range->len());
- if (expected_len < 0) {
- expected_len = strlen(expected);
- }
- int cmp = memcmp(buffer->buffer(), expected, expected_len);
- EXPECT_TRUE(cmp == 0);
- buffer->return_buffer();
- }
-
- static void validate_scan_range(DiskIoMgr::ScanRange* range, const char* expected,
- int expected_len, const Status& expected_status) {
- char result[expected_len + 1];
- memset(result, 0, expected_len + 1);
-
- while (true) {
- DiskIoMgr::BufferDescriptor* buffer = nullptr;
- Status status = range->get_next(&buffer);
- EXPECT_TRUE(status.ok() || status.code() == expected_status.code());
- if (buffer == nullptr || !status.ok()) {
- if (buffer != nullptr) buffer->return_buffer();
- break;
- }
- EXPECT_LE(buffer->len(), expected_len);
- memcpy(result + range->offset() + buffer->scan_range_offset(), buffer->buffer(),
- buffer->len());
- buffer->return_buffer();
- }
- validate_empty_or_correct(expected, result, expected_len);
- }
-
- // Continues pulling scan ranges from the io mgr until they are all done.
- // Updates num_ranges_processed with the number of ranges seen by this thread.
- static void scan_range_thread(DiskIoMgr* io_mgr, DiskIoMgr::RequestContext* reader,
- const char* expected_result, int expected_len,
- const Status& expected_status, int max_ranges,
- std::atomic<int>* num_ranges_processed) {
- int num_ranges = 0;
- while (max_ranges == 0 || num_ranges < max_ranges) {
- DiskIoMgr::ScanRange* range;
- Status status = io_mgr->get_next_range(reader, &range);
- EXPECT_TRUE(status.ok() || status.code() == expected_status.code());
- if (range == nullptr) break;
- validate_scan_range(range, expected_result, expected_len, expected_status);
- ++(*num_ranges_processed);
- ++num_ranges;
- }
- }
-
- DiskIoMgr::ScanRange* init_range(int num_buffers, const char* file_path, int offset, int len,
- int disk_id, int64_t mtime, void* meta_data = nullptr,
- bool is_cached = false) {
- DiskIoMgr::ScanRange* range = _pool->add(new DiskIoMgr::ScanRange(num_buffers));
- range->reset(nullptr, file_path, len, offset, disk_id, is_cached, true, mtime, meta_data);
- EXPECT_EQ(mtime, range->mtime());
- return range;
- }
-
- std::unique_ptr<ObjectPool> _pool;
-
- mutex _written_mutex;
- condition_variable _writes_done;
- int _num_ranges_written;
-};
-
-// Test a single writer with multiple disks and threads per disk. Each WriteRange
-// writes random 4-byte integers, and upon completion, the written data is validated
-// by reading the data back via a separate IoMgr instance. All writes are expected to
-// complete successfully.
-TEST_F(DiskIoMgrTest, SingleWriter) {
- _num_ranges_written = 0;
- string tmp_file = "/tmp/disk_io_mgr_test.txt";
- int num_ranges = 100;
- int64_t file_size = 1024 * 1024;
- int64_t cur_offset = 0;
- int success = CreateTempFile(tmp_file.c_str(), file_size);
- if (success != 0) {
- LOG(ERROR) << "Error creating temp file " << tmp_file.c_str() << " of size " << file_size;
- EXPECT_TRUE(false);
- }
-
- std::unique_ptr<DiskIoMgr> read_io_mgr(new DiskIoMgr(1, 1, 1, 10));
- Status status = read_io_mgr->init(LARGE_MEM_LIMIT);
- EXPECT_TRUE(status.ok());
- DiskIoMgr::RequestContext* reader;
- status = read_io_mgr->register_context(&reader);
- EXPECT_TRUE(status.ok());
- for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
- for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
- _pool.reset(new ObjectPool);
- DiskIoMgr io_mgr(num_disks, num_threads_per_disk, 1, 10);
- status = io_mgr.init(LARGE_MEM_LIMIT);
- EXPECT_TRUE(status.ok());
- DiskIoMgr::RequestContext* writer;
- io_mgr.register_context(&writer);
- for (int i = 0; i < num_ranges; ++i) {
- int32_t* data = _pool->add(new int32_t);
- *data = rand();
- DiskIoMgr::WriteRange** new_range = _pool->add(new DiskIoMgr::WriteRange*);
- DiskIoMgr::WriteRange::WriteDoneCallback callback =
- bind(mem_fn(&DiskIoMgrTest::write_validate_callback), this, num_ranges,
- new_range, read_io_mgr.get(), reader, data, Status::OK(), _1);
- *new_range = _pool->add(new DiskIoMgr::WriteRange(
- tmp_file, cur_offset, num_ranges % num_disks, callback));
- (*new_range)->set_data(reinterpret_cast<uint8_t*>(data), sizeof(int32_t));
- Status add_status = io_mgr.add_write_range(writer, *new_range);
- EXPECT_TRUE(add_status.ok());
- cur_offset += sizeof(int32_t);
- }
-
- {
- unique_lock<mutex> lock(_written_mutex);
- while (_num_ranges_written < num_ranges) {
- _writes_done.wait(lock);
- }
- }
- _num_ranges_written = 0;
- io_mgr.unregister_context(writer);
- }
- }
-
- read_io_mgr->unregister_context(reader);
- read_io_mgr.reset();
-}
-
-// Perform invalid writes (e.g. non-existent file, negative offset) and validate
-// that an error status is returned via the write callback.
-TEST_F(DiskIoMgrTest, InvalidWrite) {
- _num_ranges_written = 0;
- string tmp_file = "/tmp/non-existent.txt";
- DiskIoMgr io_mgr(1, 1, 1, 10);
- Status status = io_mgr.init(LARGE_MEM_LIMIT);
- EXPECT_TRUE(status.ok());
- DiskIoMgr::RequestContext* writer;
- status = io_mgr.register_context(&writer);
- _pool.reset(new ObjectPool);
- int32_t* data = _pool->add(new int32_t);
- *data = rand();
-
- // Write to a non-existent file.
- DiskIoMgr::WriteRange** new_range = _pool->add(new DiskIoMgr::WriteRange*);
- DiskIoMgr::WriteRange::WriteDoneCallback callback =
- bind(mem_fn(&DiskIoMgrTest::write_validate_callback), this, 2, new_range,
- (DiskIoMgr*)nullptr, (DiskIoMgr::RequestContext*)nullptr, data,
- Status::InternalError("Test Failure"), _1);
- *new_range = _pool->add(new DiskIoMgr::WriteRange(tmp_file, rand(), 0, callback));
-
- (*new_range)->set_data(reinterpret_cast<uint8_t*>(data), sizeof(int32_t));
- status = io_mgr.add_write_range(writer, *new_range);
- EXPECT_TRUE(status.ok());
-
- // Write to a bad location in a file that exists.
- tmp_file = "/tmp/disk_io_mgr_test.txt";
- int success = CreateTempFile(tmp_file.c_str(), 100);
- if (success != 0) {
- LOG(ERROR) << "Error creating temp file " << tmp_file.c_str() << " of size 100";
- EXPECT_TRUE(false);
- }
-
- new_range = _pool->add(new DiskIoMgr::WriteRange*);
- callback = bind(mem_fn(&DiskIoMgrTest::write_validate_callback), this, 2, new_range,
- (DiskIoMgr*)nullptr, (DiskIoMgr::RequestContext*)nullptr, data,
- Status::InternalError("Test Failure"), _1);
-
- *new_range = _pool->add(new DiskIoMgr::WriteRange(tmp_file, -1, 0, callback));
- (*new_range)->set_data(reinterpret_cast<uint8_t*>(data), sizeof(int32_t));
- status = io_mgr.add_write_range(writer, *new_range);
- EXPECT_TRUE(status.ok());
-
- {
- unique_lock<mutex> lock(_written_mutex);
- while (_num_ranges_written < 2) {
- _writes_done.wait(lock);
- }
- }
- _num_ranges_written = 0;
- io_mgr.unregister_context(writer);
-}
-
-// Issue a number of writes, cancel the writer context and issue more writes.
-// add_write_range() is expected to succeed before the cancel and fail after it.
-// The writes themselves may finish with status cancelled or ok.
-TEST_F(DiskIoMgrTest, SingleWriterCancel) {
- _num_ranges_written = 0;
- string tmp_file = "/tmp/disk_io_mgr_test.txt";
- int num_ranges = 100;
- int num_ranges_before_cancel = 25;
- int64_t file_size = 1024 * 1024;
- int64_t cur_offset = 0;
- int success = CreateTempFile(tmp_file.c_str(), file_size);
- if (success != 0) {
- LOG(ERROR) << "Error creating temp file " << tmp_file.c_str() << " of size " << file_size;
- EXPECT_TRUE(false);
- }
-
- std::unique_ptr<DiskIoMgr> read_io_mgr(new DiskIoMgr(1, 1, 1, 10));
- Status status = read_io_mgr->init(LARGE_MEM_LIMIT);
- EXPECT_TRUE(status.ok());
- DiskIoMgr::RequestContext* reader;
- status = read_io_mgr->register_context(&reader);
- EXPECT_TRUE(status.ok());
- for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
- for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
- _pool.reset(new ObjectPool);
- DiskIoMgr io_mgr(num_disks, num_threads_per_disk, 1, 10);
- status = io_mgr.init(LARGE_MEM_LIMIT);
- DiskIoMgr::RequestContext* writer;
- io_mgr.register_context(&writer);
- Status validate_status = Status::OK();
- for (int i = 0; i < num_ranges; ++i) {
- if (i == num_ranges_before_cancel) {
- io_mgr.cancel_context(writer);
- validate_status = Status::Cancelled("");
- }
- int32_t* data = _pool->add(new int32_t);
- *data = rand();
- DiskIoMgr::WriteRange** new_range = _pool->add(new DiskIoMgr::WriteRange*);
- DiskIoMgr::WriteRange::WriteDoneCallback callback =
- bind(mem_fn(&DiskIoMgrTest::write_validate_callback), this,
- num_ranges_before_cancel, new_range, read_io_mgr.get(), reader, data,
- Status::Cancelled(""), _1);
- *new_range = _pool->add(new DiskIoMgr::WriteRange(
- tmp_file, cur_offset, num_ranges % num_disks, callback));
- (*new_range)->set_data(reinterpret_cast<uint8_t*>(data), sizeof(int32_t));
- cur_offset += sizeof(int32_t);
- Status add_status = io_mgr.add_write_range(writer, *new_range);
- EXPECT_TRUE(add_status.code() == validate_status.code());
- }
-
- {
- unique_lock<mutex> lock(_written_mutex);
- while (_num_ranges_written < num_ranges_before_cancel) {
- _writes_done.wait(lock);
- }
- }
- _num_ranges_written = 0;
- io_mgr.unregister_context(writer);
- }
- }
-
- read_io_mgr->unregister_context(reader);
- read_io_mgr.reset();
-}
-
-// Basic test with a single reader, testing multiple threads, disks and a different
-// number of buffers.
-TEST_F(DiskIoMgrTest, SingleReader) {
- const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
- const char* data = "abcdefghijklm";
- int len = strlen(data);
- CreateTempFile(tmp_file, data);
-
- // Get mtime for file
- struct stat stat_val;
- stat(tmp_file, &stat_val);
-
- int64_t iters = 0;
- for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
- for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
- for (int num_buffers = 1; num_buffers <= 5; ++num_buffers) {
- for (int num_read_threads = 1; num_read_threads <= 5; ++num_read_threads) {
- _pool.reset(new ObjectPool);
- LOG(INFO) << "Starting test with num_threads_per_disk=" << num_threads_per_disk
- << " num_disk=" << num_disks << " num_buffers=" << num_buffers
- << " num_read_threads=" << num_read_threads;
-
- if (++iters % 5000 == 0) {
- LOG(ERROR) << "Starting iteration " << iters;
- }
- DiskIoMgr io_mgr(num_disks, num_threads_per_disk, 1, 1);
-
- Status status = io_mgr.init(LARGE_MEM_LIMIT);
- EXPECT_TRUE(status.ok());
- DiskIoMgr::RequestContext* reader;
- status = io_mgr.register_context(&reader);
- EXPECT_TRUE(status.ok());
-
- std::vector<DiskIoMgr::ScanRange*> ranges;
- for (int i = 0; i < len; ++i) {
- int disk_id = i % num_disks;
- ranges.push_back(init_range(num_buffers, tmp_file, 0, len, disk_id,
- stat_val.st_mtime));
- }
- status = io_mgr.add_scan_ranges(reader, ranges);
- EXPECT_TRUE(status.ok());
-
- std::atomic<int> num_ranges_processed;
- ThreadGroup threads;
- for (int i = 0; i < num_read_threads; ++i) {
- threads.add_thread(new thread(scan_range_thread, &io_mgr, reader, data, len,
- Status::OK(), 0, &num_ranges_processed));
- }
- threads.join_all();
-
- EXPECT_EQ(num_ranges_processed, ranges.size());
- io_mgr.unregister_context(reader);
- }
- }
- }
- }
- EXPECT_EQ(io_mgr->mem_tracker()->consumption(), 0);
-}
-
-// This test issues adding additional scan ranges while there are some still in flight.
-TEST_F(DiskIoMgrTest, AddScanRangeTest) {
- const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
- const char* data = "abcdefghijklm";
- int len = strlen(data);
- CreateTempFile(tmp_file, data);
-
- // Get mtime for file
- struct stat stat_val;
- stat(tmp_file, &stat_val);
-
- int64_t iters = 0;
- for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
- for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
- for (int num_buffers = 1; num_buffers <= 5; ++num_buffers) {
- _pool.reset(new ObjectPool);
- LOG(INFO) << "Starting test with num_threads_per_disk=" << num_threads_per_disk
- << " num_disk=" << num_disks << " num_buffers=" << num_buffers;
-
- if (++iters % 5000 == 0) LOG(ERROR) << "Starting iteration " << iters;
- DiskIoMgr io_mgr(num_disks, num_threads_per_disk, 1, 1);
-
- Status status = io_mgr.init(LARGE_MEM_LIMIT);
- EXPECT_TRUE(status.ok());
- DiskIoMgr::RequestContext* reader;
- status = io_mgr.register_context(&reader);
- EXPECT_TRUE(status.ok());
-
- std::vector<DiskIoMgr::ScanRange*> ranges_first_half;
- std::vector<DiskIoMgr::ScanRange*> ranges_second_half;
- for (int i = 0; i < len; ++i) {
- int disk_id = i % num_disks;
- if (i > len / 2) {
- ranges_second_half.push_back(init_range(num_buffers, tmp_file, i, 1,
- disk_id, stat_val.st_mtime));
- } else {
- ranges_first_half.push_back(init_range(num_buffers, tmp_file, i, 1, disk_id,
- stat_val.st_mtime));
- }
- }
- std::atomic<int> num_ranges_processed;
-
- // Issue first half the scan ranges.
- status = io_mgr.add_scan_ranges(reader, ranges_first_half);
- EXPECT_TRUE(status.ok());
-
- // Read a couple of them
- scan_range_thread(&io_mgr, reader, data, strlen(data), Status::OK(), 2,
- &num_ranges_processed);
-
- // Issue second half
- status = io_mgr.add_scan_ranges(reader, ranges_second_half);
- EXPECT_TRUE(status.ok());
-
- // Start up some threads and then cancel
- ThreadGroup threads;
- for (int i = 0; i < 3; ++i) {
- threads.add_thread(new thread(scan_range_thread, &io_mgr, reader, data,
- strlen(data), Status::Cancelled(""), 0,
- &num_ranges_processed));
- }
-
- threads.join_all();
- EXPECT_EQ(num_ranges_processed, len);
- io_mgr.unregister_context(reader);
- }
- }
- }
- EXPECT_EQ(io_mgr->mem_tracker()->consumption(), 0);
-}
-
-// Test to make sure that sync reads and async reads work together
-// Note: this test is constructed so the number of buffers is greater than the
-// number of scan ranges.
-TEST_F(DiskIoMgrTest, SyncReadTest) {
- const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
- const char* data = "abcdefghijklm";
- int len = strlen(data);
- CreateTempFile(tmp_file, data);
-
- // Get mtime for file
- struct stat stat_val;
- stat(tmp_file, &stat_val);
-
- int64_t iters = 0;
- for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
- for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
- for (int num_buffers = 1; num_buffers <= 5; ++num_buffers) {
- _pool.reset(new ObjectPool);
- LOG(INFO) << "Starting SyncReadTest test with num_threads_per_disk="
- << num_threads_per_disk << " num_disk=" << num_disks
- << " num_buffers=" << num_buffers;
-
- if (++iters % 5000 == 0) {
- LOG(ERROR) << "Starting iteration " << iters;
- }
- DiskIoMgr io_mgr(num_disks, num_threads_per_disk, MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
-
- Status status = io_mgr.init(LARGE_MEM_LIMIT);
- EXPECT_TRUE(status.ok());
- DiskIoMgr::RequestContext* reader;
- status = io_mgr.register_context(&reader);
- EXPECT_TRUE(status.ok());
-
- DiskIoMgr::ScanRange* complete_range =
- init_range(1, tmp_file, 0, strlen(data), 0, stat_val.st_mtime);
-
- // Issue some reads before the async ones are issued
- validate_sync_read(&io_mgr, reader, complete_range, data);
- validate_sync_read(&io_mgr, reader, complete_range, data);
-
- std::vector<DiskIoMgr::ScanRange*> ranges;
- for (int i = 0; i < len; ++i) {
- int disk_id = i % num_disks;
- ranges.push_back(
- init_range(num_buffers, tmp_file, 0, len, disk_id, stat_val.st_mtime));
- }
- status = io_mgr.add_scan_ranges(reader, ranges);
- EXPECT_TRUE(status.ok());
-
- std::atomic<int> num_ranges_processed;
- ThreadGroup threads;
- for (int i = 0; i < 5; ++i) {
- threads.add_thread(new thread(scan_range_thread, &io_mgr, reader, data,
- strlen(data), Status::OK(), 0,
- &num_ranges_processed));
- }
-
- // Issue some more sync ranges
- for (int i = 0; i < 5; ++i) {
- sched_yield();
- validate_sync_read(&io_mgr, reader, complete_range, data);
- }
-
- threads.join_all();
-
- validate_sync_read(&io_mgr, reader, complete_range, data);
- validate_sync_read(&io_mgr, reader, complete_range, data);
-
- EXPECT_EQ(num_ranges_processed, ranges.size());
- io_mgr.unregister_context(reader);
- }
- }
- }
- EXPECT_EQ(io_mgr->mem_tracker()->consumption(), 0);
-}
-
-// Tests a single reader cancelling half way through scan ranges.
-TEST_F(DiskIoMgrTest, SingleReaderCancel) {
- const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
- const char* data = "abcdefghijklm";
- int len = strlen(data);
- CreateTempFile(tmp_file, data);
-
- // Get mtime for file
- struct stat stat_val;
- stat(tmp_file, &stat_val);
-
- int64_t iters = 0;
- for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
- for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
- for (int num_buffers = 1; num_buffers <= 5; ++num_buffers) {
- _pool.reset(new ObjectPool);
- LOG(INFO) << "Starting test with num_threads_per_disk=" << num_threads_per_disk
- << " num_disk=" << num_disks << " num_buffers=" << num_buffers;
-
- if (++iters % 5000 == 0) LOG(ERROR) << "Starting iteration " << iters;
- DiskIoMgr io_mgr(num_disks, num_threads_per_disk, 1, 1);
-
- Status status = io_mgr.init(LARGE_MEM_LIMIT);
- EXPECT_TRUE(status.ok());
- DiskIoMgr::RequestContext* reader;
- status = io_mgr.register_context(&reader);
- EXPECT_TRUE(status.ok());
-
- std::vector<DiskIoMgr::ScanRange*> ranges;
- for (int i = 0; i < len; ++i) {
- int disk_id = i % num_disks;
- ranges.push_back(
- init_range(num_buffers, tmp_file, 0, len, disk_id, stat_val.st_mtime));
- }
- status = io_mgr.add_scan_ranges(reader, ranges);
- EXPECT_TRUE(status.ok());
-
- std::atomic<int> num_ranges_processed;
- int num_successful_ranges = ranges.size() / 2;
- // Read half the ranges
- for (int i = 0; i < num_successful_ranges; ++i) {
- scan_range_thread(&io_mgr, reader, data, strlen(data), Status::OK(), 1,
- &num_ranges_processed);
- }
- EXPECT_EQ(num_ranges_processed, num_successful_ranges);
-
- // Start up some threads and then cancel
- ThreadGroup threads;
- for (int i = 0; i < 3; ++i) {
- threads.add_thread(new thread(scan_range_thread, &io_mgr, reader, data,
- strlen(data), Status::Cancelled(""), 0,
- &num_ranges_processed));
- }
-
- io_mgr.cancel_context(reader);
- sched_yield();
-
- threads.join_all();
- EXPECT_TRUE(io_mgr.context_status(reader).is_cancelled());
- io_mgr.unregister_context(reader);
- }
- }
- }
- EXPECT_EQ(io_mgr->mem_tracker()->consumption(), 0);
-}
-
-// Test when the reader goes over the mem limit
-TEST_F(DiskIoMgrTest, MemTrackers) {
- const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
- const char* data = "abcdefghijklm";
- int len = strlen(data);
- CreateTempFile(tmp_file, data);
-
- // Get mtime for file
- struct stat stat_val;
- stat(tmp_file, &stat_val);
-
- const int num_buffers = 25;
- // Give the reader more buffers than the limit
- const int mem_limit_num_buffers = 2;
-
- int64_t iters = 0;
- {
- _pool.reset(new ObjectPool);
- if (++iters % 1000 == 0) {
- LOG(ERROR) << "Starting iteration " << iters;
- }
-
- DiskIoMgr io_mgr(1, 1, MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
-
- Status status = io_mgr.init(LARGE_MEM_LIMIT);
- EXPECT_TRUE(status.ok());
- DiskIoMgr::RequestContext* reader;
- status = io_mgr.register_context(&reader);
- EXPECT_TRUE(status.ok());
-
- std::vector<DiskIoMgr::ScanRange*> ranges;
- for (int i = 0; i < num_buffers; ++i) {
- ranges.push_back(init_range(num_buffers, tmp_file, 0, len, 0, stat_val.st_mtime));
- }
- status = io_mgr.add_scan_ranges(reader, ranges);
- EXPECT_TRUE(status.ok());
-
- // Don't return buffers to force memory pressure
- std::vector<DiskIoMgr::BufferDescriptor*> buffers;
-
- std::atomic<int> num_ranges_processed;
- scan_range_thread(&io_mgr, reader, data, strlen(data), Status::MemoryLimitExceeded("Mem"),
- 1, &num_ranges_processed);
-
- char result[strlen(data) + 1];
- // Keep reading new ranges without returning buffers. This forces us
- // to go over the limit eventually.
- while (true) {
- memset(result, 0, strlen(data) + 1);
- DiskIoMgr::ScanRange* range = nullptr;
- status = io_mgr.get_next_range(reader, &range);
- EXPECT_TRUE(status.ok() || status.is_mem_limit_exceeded());
- if (range == nullptr) break;
-
- while (true) {
- DiskIoMgr::BufferDescriptor* buffer = nullptr;
- Status status = range->get_next(&buffer);
- EXPECT_TRUE(status.ok() || status.is_mem_limit_exceeded());
- if (buffer == nullptr) break;
- memcpy(result + range->offset() + buffer->scan_range_offset(), buffer->buffer(),
- buffer->len());
- buffers.push_back(buffer);
- }
- validate_empty_or_correct(data, result, strlen(data));
- }
-
- for (int i = 0; i < buffers.size(); ++i) {
- buffers[i]->return_buffer();
- }
-
- EXPECT_TRUE(io_mgr.context_status(reader).is_mem_limit_exceeded());
- io_mgr.unregister_context(reader);
- }
-}
-#if 0
-// Test when some scan ranges are marked as being cached.
-// Since these files are not in HDFS, the cached path always fails so this
-// only tests the fallback mechanism.
-// TODO: we can fake the cached read path without HDFS
-TEST_F(DiskIoMgrTest, CachedReads) {
- const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
- const char* data = "abcdefghijklm";
- int len = strlen(data);
- CreateTempFile(tmp_file, data);
-
- // Get mtime for file
- struct stat stat_val;
- stat(tmp_file, &stat_val);
-
- const int num_disks = 2;
- const int num_buffers = 3;
-
- int64_t iters = 0;
- {
- _pool.reset(new ObjectPool);
- if (++iters % 5000 == 0) LOG(ERROR) << "Starting iteration " << iters;
- DiskIoMgr io_mgr(num_disks, 1, MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
-
- Status status = io_mgr.init(LARGE_MEM_LIMIT);
- EXPECT_TRUE(status.ok());
- DiskIoMgr::RequestContext* reader;
- status = io_mgr.register_context(&reader);
- EXPECT_TRUE(status.ok());
-
- DiskIoMgr::ScanRange* complete_range =
- init_range(1, tmp_file, 0, strlen(data), 0, stat_val.st_mtime, nullptr, true);
-
- // Issue some reads before the async ones are issued
- validate_sync_read(&io_mgr, reader, complete_range, data);
- validate_sync_read(&io_mgr, reader, complete_range, data);
-
- std::vector<DiskIoMgr::ScanRange*> ranges;
- for (int i = 0; i < len; ++i) {
- int disk_id = i % num_disks;
- ranges.push_back(init_range(num_buffers, tmp_file, 0, len, disk_id,
- stat_val.st_mtime, nullptr, true));
- }
- status = io_mgr.add_scan_ranges(reader, ranges);
- EXPECT_TRUE(status.ok());
-
- std::atomic<int> num_ranges_processed;
- ThreadGroup threads;
- for (int i = 0; i < 5; ++i) {
- threads.add_thread(new thread(scan_range_thread, &io_mgr, reader, data,
- strlen(data), Status::OK(), 0, &num_ranges_processed));
- }
-
- // Issue some more sync ranges
- for (int i = 0; i < 5; ++i) {
- sched_yield();
- validate_sync_read(&io_mgr, reader, complete_range, data);
- }
-
- threads.join_all();
-
- validate_sync_read(&io_mgr, reader, complete_range, data);
- validate_sync_read(&io_mgr, reader, complete_range, data);
-
- EXPECT_EQ(num_ranges_processed, ranges.size());
- io_mgr.unregister_context(reader);
- }
- EXPECT_EQ(io_mgr->mem_tracker()->consumption(), 0);
-}
-#endif // end #if 0
-
-TEST_F(DiskIoMgrTest, MultipleReaderWriter) {
- const int ITERATIONS = 1;
- const char* data = "abcdefghijklmnopqrstuvwxyz";
- const int num_contexts = 5;
- const int file_size = 4 * 1024;
- const int num_writes_queued = 5;
- const int num_reads_queued = 5;
-
- string file_name = "/tmp/disk_io_mgr_test.txt";
- int success = CreateTempFile(file_name.c_str(), file_size);
- if (success != 0) {
- LOG(ERROR) << "Error creating temp file " << file_name.c_str() << " of size " << file_size;
- EXPECT_TRUE(false);
- }
-
- // Get mtime for file
- struct stat stat_val;
- stat(file_name.c_str(), &stat_val);
-
- int64_t iters = 0;
- std::vector<DiskIoMgr::RequestContext*> contexts(num_contexts);
- Status status;
- for (int iteration = 0; iteration < ITERATIONS; ++iteration) {
- for (int threads_per_disk = 1; threads_per_disk <= 5; ++threads_per_disk) {
- for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
- DiskIoMgr io_mgr(num_disks, threads_per_disk, MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
- io_mgr.init(LARGE_MEM_LIMIT);
- for (int file_index = 0; file_index < num_contexts; ++file_index) {
- status = io_mgr.register_context(&contexts[file_index]);
- EXPECT_TRUE(status.ok());
- }
- _pool.reset(new ObjectPool);
- int read_offset = 0;
- int write_offset = 0;
- while (read_offset < file_size) {
- for (int context_index = 0; context_index < num_contexts; ++context_index) {
- if (++iters % 5000 == 0) {
- LOG(ERROR) << "Starting iteration " << iters;
- }
- std::atomic<int> num_ranges_processed;
- ThreadGroup threads;
- std::vector<DiskIoMgr::ScanRange*> ranges;
- int num_scan_ranges =
- std::min<int>(num_reads_queued, write_offset - read_offset);
- for (int i = 0; i < num_scan_ranges; ++i) {
- ranges.push_back(init_range(1, file_name.c_str(), read_offset, 1,
- i % num_disks, stat_val.st_mtime));
- threads.add_thread(new thread(
- scan_range_thread, &io_mgr, contexts[context_index],
- reinterpret_cast<const char*>(data +
- (read_offset % strlen(data))),
- 1, Status::OK(), num_scan_ranges, &num_ranges_processed));
- ++read_offset;
- }
-
- _num_ranges_written = 0;
- int num_write_ranges =
- std::min<int>(num_writes_queued, file_size - write_offset);
- for (int i = 0; i < num_write_ranges; ++i) {
- DiskIoMgr::WriteRange::WriteDoneCallback callback =
- bind(mem_fn(&DiskIoMgrTest::write_complete_callback), this,
- num_write_ranges, _1);
- DiskIoMgr::WriteRange* new_range = _pool->add(new DiskIoMgr::WriteRange(
- file_name, write_offset, i % num_disks, callback));
- new_range->set_data(reinterpret_cast<const uint8_t*>(
- data + (write_offset % strlen(data))),
- 1);
- status = io_mgr.add_write_range(contexts[context_index], new_range);
- ++write_offset;
- }
-
- {
- unique_lock<mutex> lock(_written_mutex);
- while (_num_ranges_written < num_write_ranges) {
- _writes_done.wait(lock);
- }
- }
-
- threads.join_all();
- } // for (int context_index
- } // while (read_offset < file_size)
-
- for (int file_index = 0; file_index < num_contexts; ++file_index) {
- io_mgr.unregister_context(contexts[file_index]);
- }
- } // for (int num_disks
- } // for (int threads_per_disk
- } // for (int iteration
-}
-
-// This test will test multiple concurrent reads each reading a different file.
-TEST_F(DiskIoMgrTest, MultipleReader) {
- const int NUM_READERS = 5;
- const int DATA_LEN = 50;
- const int ITERATIONS = 25;
- const int NUM_THREADS_PER_READER = 3;
-
- std::vector<string> file_names;
- std::vector<int64_t> mtimes;
- std::vector<string> data;
- std::vector<DiskIoMgr::RequestContext*> readers;
- std::vector<char*> results;
-
- file_names.resize(NUM_READERS);
- readers.resize(NUM_READERS);
- mtimes.resize(NUM_READERS);
- data.resize(NUM_READERS);
- results.resize(NUM_READERS);
-
- // Initialize data for each reader. The data will be
- // 'abcd...' for reader one, 'bcde...' for reader two (wrapping around at 'z')
- for (int i = 0; i < NUM_READERS; ++i) {
- char buf[DATA_LEN];
- for (int j = 0; j < DATA_LEN; ++j) {
- int c = (j + i) % 26;
- buf[j] = 'a' + c;
- }
- data[i] = string(buf, DATA_LEN);
-
- std::stringstream ss;
- ss << "/tmp/disk_io_mgr_test" << i << ".txt";
- file_names[i] = ss.str();
- CreateTempFile(ss.str().c_str(), data[i].c_str());
-
- // Get mtime for file
- struct stat stat_val;
- stat(file_names[i].c_str(), &stat_val);
- mtimes[i] = stat_val.st_mtime;
-
- results[i] = new char[DATA_LEN + 1];
- memset(results[i], 0, DATA_LEN + 1);
- }
-
- // This exercises concurrency, run the test multiple times
- int64_t iters = 0;
- for (int iteration = 0; iteration < ITERATIONS; ++iteration) {
- for (int threads_per_disk = 1; threads_per_disk <= 5; ++threads_per_disk) {
- for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
- for (int num_buffers = 1; num_buffers <= 5; ++num_buffers) {
- _pool.reset(new ObjectPool);
- LOG(INFO) << "Starting test with num_threads_per_disk=" << threads_per_disk
- << " num_disk=" << num_disks << " num_buffers=" << num_buffers;
- if (++iters % 2500 == 0) LOG(ERROR) << "Starting iteration " << iters;
-
- DiskIoMgr io_mgr(num_disks, threads_per_disk, MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
- Status status = io_mgr.init(LARGE_MEM_LIMIT);
- EXPECT_TRUE(status.ok());
-
- for (int i = 0; i < NUM_READERS; ++i) {
- status = io_mgr.register_context(&readers[i], nullptr);
- EXPECT_TRUE(status.ok());
-
- std::vector<DiskIoMgr::ScanRange*> ranges;
- for (int j = 0; j < DATA_LEN; ++j) {
- int disk_id = j % num_disks;
- ranges.push_back(init_range(num_buffers, file_names[i].c_str(), j, 1,
- disk_id, mtimes[i]));
- }
- status = io_mgr.add_scan_ranges(readers[i], ranges);
- EXPECT_TRUE(status.ok());
- }
-
- std::atomic<int> num_ranges_processed;
- ThreadGroup threads;
- for (int i = 0; i < NUM_READERS; ++i) {
- for (int j = 0; j < NUM_THREADS_PER_READER; ++j) {
- threads.add_thread(new thread(scan_range_thread, &io_mgr, readers[i],
- data[i].c_str(), data[i].size(),
- Status::OK(), 0, &num_ranges_processed));
- }
- }
- threads.join_all();
- EXPECT_EQ(num_ranges_processed, DATA_LEN * NUM_READERS);
- for (int i = 0; i < NUM_READERS; ++i) {
- io_mgr.unregister_context(readers[i]);
- }
- }
- }
- }
- }
- EXPECT_EQ(io_mgr->mem_tracker()->consumption(), 0);
-}
-
-#if 0
-// Stress test for multiple clients with cancellation
-// TODO: the stress app should be expanded to include sync reads and adding scan
-// ranges in the middle.
-TEST_F(DiskIoMgrTest, StressTest) {
- // Run the test with 5 disks, 5 threads per disk, 10 clients and with cancellation
- DiskIoMgrStress test(5, 5, 10, true);
- test.Run(2); // In seconds
-}
-#endif
-
-TEST_F(DiskIoMgrTest, Buffers) {
- // Test default min/max buffer size
- int min_buffer_size = 1024;
- int max_buffer_size = 8 * 1024 * 1024; // 8 MB
-
- DiskIoMgr io_mgr(1, 1, min_buffer_size, max_buffer_size);
- Status status = io_mgr.init(max_buffer_size * 2);
- EXPECT_TRUE(status.ok());
- EXPECT_EQ(io_mgr->mem_tracker()->consumption(), 0);
-
- // buffer length should be rounded up to min buffer size
- int64_t buffer_len = 1;
- char* buf = io_mgr.get_free_buffer(&buffer_len);
- EXPECT_EQ(buffer_len, min_buffer_size);
- EXPECT_EQ(io_mgr._num_allocated_buffers, 1);
- io_mgr.return_free_buffer(buf, buffer_len);
- EXPECT_EQ(io_mgr->mem_tracker()->consumption(), min_buffer_size);
-
- // reuse buffer
- buffer_len = min_buffer_size;
- buf = io_mgr.get_free_buffer(&buffer_len);
- EXPECT_EQ(buffer_len, min_buffer_size);
- EXPECT_EQ(io_mgr._num_allocated_buffers, 1);
- io_mgr.return_free_buffer(buf, buffer_len);
- EXPECT_EQ(io_mgr->mem_tracker()->consumption(), min_buffer_size);
-
- // bump up to next buffer size
- buffer_len = min_buffer_size + 1;
- buf = io_mgr.get_free_buffer(&buffer_len);
- EXPECT_EQ(buffer_len, min_buffer_size * 2);
- EXPECT_EQ(io_mgr._num_allocated_buffers, 2);
- EXPECT_EQ(io_mgr->mem_tracker()->consumption(), min_buffer_size * 3);
-
- // gc unused buffer
- io_mgr.gc_io_buffers();
- EXPECT_EQ(io_mgr._num_allocated_buffers, 1);
- EXPECT_EQ(io_mgr->mem_tracker()->consumption(), min_buffer_size * 2);
-
- io_mgr.return_free_buffer(buf, buffer_len);
-
- // max buffer size
- buffer_len = max_buffer_size;
- buf = io_mgr.get_free_buffer(&buffer_len);
- EXPECT_EQ(buffer_len, max_buffer_size);
- EXPECT_EQ(io_mgr._num_allocated_buffers, 2);
- io_mgr.return_free_buffer(buf, buffer_len);
- EXPECT_EQ(io_mgr->mem_tracker()->consumption(), min_buffer_size * 2 + max_buffer_size);
-
- // gc buffers
- io_mgr.gc_io_buffers();
- EXPECT_EQ(io_mgr._num_allocated_buffers, 0);
- EXPECT_EQ(io_mgr->mem_tracker()->consumption(), 0);
-}
-
-// IMPALA-2366: handle partial read where range goes past end of file.
-TEST_F(DiskIoMgrTest, PartialRead) {
- const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
- const char* data = "the quick brown fox jumped over the lazy dog";
- int len = strlen(data);
- int read_len = len + 1000; // Read past end of file.
- CreateTempFile(tmp_file, data);
-
- // Get mtime for file
- struct stat stat_val;
- stat(tmp_file, &stat_val);
-
- _pool.reset(new ObjectPool);
- std::unique_ptr<DiskIoMgr> io_mgr(new DiskIoMgr(1, 1, read_len, read_len));
-
- Status status = io_mgr->init(LARGE_MEM_LIMIT);
- EXPECT_TRUE(status.ok());
- DiskIoMgr::RequestContext* reader;
- status = io_mgr->register_context(&reader);
- EXPECT_TRUE(status.ok());
-
- // We should not read past the end of file.
- DiskIoMgr::ScanRange* range = init_range(1, tmp_file, 0, read_len, 0, stat_val.st_mtime);
- DiskIoMgr::BufferDescriptor* buffer;
- status = io_mgr->read(reader, range, &buffer);
- EXPECT_TRUE(status.ok());
- EXPECT_TRUE(buffer->eosr());
- EXPECT_EQ(len, buffer->len());
- EXPECT_TRUE(memcmp(buffer->buffer(), data, len) == 0);
- buffer->return_buffer();
-
- io_mgr->unregister_context(reader);
- _pool.reset();
- io_mgr.reset();
- EXPECT_EQ(io_mgr->mem_tracker()->consumption(), 0);
-}
-
-} // end namespace doris
diff --git a/be/test/runtime/test_env.cc b/be/test/runtime/test_env.cc
index 2c40d4432f..f33b6b5238 100644
--- a/be/test/runtime/test_env.cc
+++ b/be/test/runtime/test_env.cc
@@ -22,7 +22,6 @@
#include <memory>
#include "olap/storage_engine.h"
-#include "runtime/bufferpool/buffer_pool.h"
#include "runtime/fragment_mgr.h"
#include "runtime/result_queue_mgr.h"
#include "util/disk_info.h"
@@ -34,8 +33,6 @@ TestEnv::TestEnv() {
// Some code will use ExecEnv::GetInstance(), so init the global ExecEnv singleton
_exec_env = ExecEnv::GetInstance();
_exec_env->_thread_mgr = new ThreadResourceMgr(2);
- _exec_env->_disk_io_mgr = new DiskIoMgr(1, 1, 1, 10);
- _exec_env->disk_io_mgr()->init(-1);
_exec_env->_result_queue_mgr = new ResultQueueMgr();
// TODO may need rpc support, etc.
}
@@ -50,14 +47,8 @@ void TestEnv::init_tmp_file_mgr(const std::vector<std::string>& tmp_dirs, bool o
DCHECK(st.ok()) << st;
}
-void TestEnv::init_buffer_pool(int64_t min_page_len, int64_t capacity, int64_t clean_pages_limit) {
- _exec_env->_buffer_pool = new BufferPool(min_page_len, capacity, clean_pages_limit);
-}
-
TestEnv::~TestEnv() {
SAFE_DELETE(_exec_env->_result_queue_mgr);
- SAFE_DELETE(_exec_env->_buffer_pool);
- SAFE_DELETE(_exec_env->_disk_io_mgr);
SAFE_DELETE(_exec_env->_thread_mgr);
if (_engine == StorageEngine::_s_instance) {
diff --git a/be/test/runtime/test_env.h b/be/test/runtime/test_env.h
index ea034ebd19..a6baae9d27 100644
--- a/be/test/runtime/test_env.h
+++ b/be/test/runtime/test_env.h
@@ -18,7 +18,6 @@
#ifndef DORIS_BE_TEST_QUERY_RUNTIME_TEST_ENV_H
#define DORIS_BE_TEST_QUERY_RUNTIME_TEST_ENV_H
-#include "runtime/disk_io_mgr.h"
#include "runtime/exec_env.h"
#include "runtime/runtime_state.h"
#include "runtime/tmp_file_mgr.h"
@@ -37,8 +36,6 @@ public:
// query states have been created.
void init_tmp_file_mgr(const std::vector<std::string>& tmp_dirs, bool one_dir_per_device);
- void init_buffer_pool(int64_t min_page_len, int64_t capacity, int64_t clean_pages_limit);
-
// If don't need to open, paths can be empty.
void init_storage_engine(bool need_open, const std::vector<std::string>& paths = {});
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org