You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by yi...@apache.org on 2023/01/13 01:43:04 UTC
[doris] branch master updated: [refactor](remove unused code) remove buffer pool and disk io mgr (#15853)

This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 16862d9b43 [refactor](remove unused code) remove buffer pool and disk io mgr (#15853)
16862d9b43 is described below

commit 16862d9b4389b7061f57cb0df702ba4a5456ecec
Author: yiguolei <67...@qq.com>
AuthorDate: Fri Jan 13 09:42:58 2023 +0800

    [refactor](remove unused code) remove buffer pool and disk io mgr (#15853)
    
    * [refactor](remove buffer pool and disk io mgr) remove unused code
    
    
    Co-authored-by: yiguolei <yi...@gmail.com>
---
 be/src/common/config.h                           |   10 +-
 be/src/common/daemon.cpp                         |   22 -
 be/src/common/daemon.h                           |    3 -
 be/src/exec/exec_node.cpp                        |   34 -
 be/src/exec/exec_node.h                          |   16 -
 be/src/runtime/CMakeLists.txt                    |    7 -
 be/src/runtime/bufferpool/buffer_allocator.cc    |  744 --------------
 be/src/runtime/bufferpool/buffer_allocator.h     |  241 -----
 be/src/runtime/bufferpool/buffer_pool.cc         |  667 ------------
 be/src/runtime/bufferpool/buffer_pool.h          |  466 ---------
 be/src/runtime/bufferpool/buffer_pool_counters.h |   43 -
 be/src/runtime/bufferpool/buffer_pool_internal.h |  299 ------
 be/src/runtime/bufferpool/free_list.h            |  115 ---
 be/src/runtime/bufferpool/suballocator.cc        |  252 -----
 be/src/runtime/bufferpool/suballocator.h         |  221 ----
 be/src/runtime/bufferpool/system_allocator.cc    |  166 ---
 be/src/runtime/bufferpool/system_allocator.h     |   49 -
 be/src/runtime/disk_io_mgr.cc                    | 1195 ----------------------
 be/src/runtime/disk_io_mgr.h                     |  837 ---------------
 be/src/runtime/disk_io_mgr_internal.h            |  455 --------
 be/src/runtime/disk_io_mgr_reader_context.cc     |  322 ------
 be/src/runtime/disk_io_mgr_scan_range.cc         |  481 ---------
 be/src/runtime/exec_env.h                        |   10 -
 be/src/runtime/exec_env_init.cpp                 |   48 -
 be/src/runtime/runtime_state.h                   |    1 -
 be/src/util/filesystem_util.h                    |    4 +-
 be/test/CMakeLists.txt                           |    1 -
 be/test/runtime/disk_io_mgr_test.cpp             | 1069 -------------------
 be/test/runtime/test_env.cc                      |    9 -
 be/test/runtime/test_env.h                       |    3 -
 30 files changed, 3 insertions(+), 7787 deletions(-)

diff --git a/be/src/common/config.h b/be/src/common/config.h
index 394ffd71c4..021410f26b 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -424,7 +424,7 @@ CONF_Int32(min_buffer_size, "1024"); // 1024, The minimum read buffer size (in b
 CONF_Int32(max_free_io_buffers, "128");
 
 // Whether to disable the memory cache pool,
-// including MemPool, ChunkAllocator, BufferPool, DiskIO free buffer.
+// including MemPool, ChunkAllocator, DiskIO free buffer.
 CONF_Bool(disable_mem_pools, "false");
 
 // The reserved bytes limit of Chunk Allocator, usually set as a percentage of mem_limit.
@@ -473,14 +473,6 @@ CONF_Bool(madvise_huge_pages, "false");
 // whether use mmap to allocate memory
 CONF_Bool(mmap_buffers, "false");
 
-// max memory can be allocated by buffer pool
-// This is the percentage of mem_limit
-CONF_String(buffer_pool_limit, "20%");
-
-// clean page can be hold by buffer pool
-// This is the percentage of buffer_pool_limit
-CONF_String(buffer_pool_clean_pages_limit, "50%");
-
 // Sleep time in milliseconds between memory maintenance iterations
 CONF_mInt64(memory_maintenance_sleep_time_ms, "500");
 
diff --git a/be/src/common/daemon.cpp b/be/src/common/daemon.cpp
index 296df8afbd..17e8046f0b 100644
--- a/be/src/common/daemon.cpp
+++ b/be/src/common/daemon.cpp
@@ -50,7 +50,6 @@
 #include "geo/geo_functions.h"
 #include "olap/options.h"
 #include "runtime/block_spill_manager.h"
-#include "runtime/bufferpool/buffer_pool.h"
 #include "runtime/exec_env.h"
 #include "runtime/fragment_mgr.h"
 #include "runtime/load_channel_mgr.h"
@@ -188,20 +187,6 @@ void Daemon::tcmalloc_gc_thread() {
 #endif
 }
 
-void Daemon::buffer_pool_gc_thread() {
-    while (!_stop_background_threads_latch.wait_for(std::chrono::seconds(10))) {
-        ExecEnv* env = ExecEnv::GetInstance();
-        // ExecEnv may not have been created yet or this may be the catalogd or statestored,
-        // which don't have ExecEnvs.
-        if (env != nullptr) {
-            BufferPool* buffer_pool = env->buffer_pool();
-            if (buffer_pool != nullptr) {
-                buffer_pool->Maintenance();
-            }
-        }
-    }
-}
-
 void Daemon::memory_maintenance_thread() {
     int64_t interval_milliseconds = config::memory_maintenance_sleep_time_ms;
     while (!_stop_background_threads_latch.wait_for(
@@ -429,10 +414,6 @@ void Daemon::start() {
             "Daemon", "tcmalloc_gc_thread", [this]() { this->tcmalloc_gc_thread(); },
             &_tcmalloc_gc_thread);
     CHECK(st.ok()) << st;
-    st = Thread::create(
-            "Daemon", "buffer_pool_gc_thread", [this]() { this->buffer_pool_gc_thread(); },
-            &_buffer_pool_gc_thread);
-    CHECK(st.ok()) << st;
     st = Thread::create(
             "Daemon", "memory_maintenance_thread", [this]() { this->memory_maintenance_thread(); },
             &_memory_maintenance_thread);
@@ -467,9 +448,6 @@ void Daemon::stop() {
     if (_tcmalloc_gc_thread) {
         _tcmalloc_gc_thread->join();
     }
-    if (_buffer_pool_gc_thread) {
-        _buffer_pool_gc_thread->join();
-    }
     if (_memory_maintenance_thread) {
         _memory_maintenance_thread->join();
     }
diff --git a/be/src/common/daemon.h b/be/src/common/daemon.h
index 39dbd4235f..525c1e1aa0 100644
--- a/be/src/common/daemon.h
+++ b/be/src/common/daemon.h
@@ -46,7 +46,6 @@ public:
 
 private:
     void tcmalloc_gc_thread();
-    void buffer_pool_gc_thread();
     void memory_maintenance_thread();
     void load_channel_tracker_refresh_thread();
     void calculate_metrics_thread();
@@ -54,8 +53,6 @@ private:
 
     CountDownLatch _stop_background_threads_latch;
     scoped_refptr<Thread> _tcmalloc_gc_thread;
-    // only buffer pool gc, will be removed after.
-    scoped_refptr<Thread> _buffer_pool_gc_thread;
     scoped_refptr<Thread> _memory_maintenance_thread;
     scoped_refptr<Thread> _load_channel_tracker_refresh_thread;
     scoped_refptr<Thread> _calculate_metrics_thread;
diff --git a/be/src/exec/exec_node.cpp b/be/src/exec/exec_node.cpp
index 3a2d713403..9bd0c5c494 100644
--- a/be/src/exec/exec_node.cpp
+++ b/be/src/exec/exec_node.cpp
@@ -224,10 +224,6 @@ void ExecNode::release_resource(doris::RuntimeState* state) {
         }
         vectorized::VExpr::close(_projections, state);
 
-        if (_buffer_pool_client.is_registered()) {
-            state->exec_env()->buffer_pool()->DeregisterClient(&_buffer_pool_client);
-        }
-
         runtime_profile()->add_to_span();
         _is_resource_released = true;
     }
@@ -597,36 +593,6 @@ void ExecNode::init_runtime_profile(const std::string& name) {
     _runtime_profile->set_metadata(_id);
 }
 
-Status ExecNode::claim_buffer_reservation(RuntimeState* state) {
-    DCHECK(!_buffer_pool_client.is_registered());
-    BufferPool* buffer_pool = ExecEnv::GetInstance()->buffer_pool();
-    // Check the minimum buffer size in case the minimum buffer size used by the planner
-    // doesn't match this backend's.
-    std::stringstream ss;
-    if (_resource_profile.__isset.spillable_buffer_size &&
-        _resource_profile.spillable_buffer_size < buffer_pool->min_buffer_len()) {
-        ss << "Spillable buffer size for node " << _id << " of "
-           << _resource_profile.spillable_buffer_size
-           << "bytes is less than the minimum buffer pool buffer size of "
-           << buffer_pool->min_buffer_len() << "bytes";
-        return Status::InternalError(ss.str());
-    }
-
-    ss << print_plan_node_type(_type) << " id=" << _id << " ptr=" << this;
-    RETURN_IF_ERROR(buffer_pool->RegisterClient(ss.str(), runtime_profile(), &_buffer_pool_client));
-
-    /*
-    if (debug_action_ == TDebugAction::SET_DENY_RESERVATION_PROBABILITY &&
-        (debug_phase_ == TExecNodePhase::PREPARE || debug_phase_ == TExecNodePhase::OPEN)) {
-       // We may not have been able to enable the debug action at the start of Prepare() or
-       // Open() because the client is not registered then. Do it now to be sure that it is
-       // effective.
-               RETURN_IF_ERROR(EnableDenyReservationDebugAction());
-    } 
-*/
-    return Status::OK();
-}
-
 void ExecNode::release_block_memory(vectorized::Block& block, uint16_t child_idx) {
     DCHECK(child_idx < _children.size());
     block.clear_column_data(child(child_idx)->row_desc().num_materialized_slots());
diff --git a/be/src/exec/exec_node.h b/be/src/exec/exec_node.h
index ff95d96934..353db077f3 100644
--- a/be/src/exec/exec_node.h
+++ b/be/src/exec/exec_node.h
@@ -26,7 +26,6 @@
 
 #include "common/status.h"
 #include "gen_cpp/PlanNodes_types.h"
-#include "runtime/bufferpool/buffer_pool.h"
 #include "runtime/descriptors.h"
 #include "runtime/query_statistics.h"
 #include "service/backend_options.h"
@@ -248,15 +247,6 @@ public:
 protected:
     friend class DataSink;
 
-    /// Initialize 'buffer_pool_client_' and claim the initial reservation for this
-    /// ExecNode. Only needs to be called by ExecNodes that will use the client.
-    /// The client is automatically cleaned up in Close(). Should not be called if
-    /// the client is already open.
-    /// The ExecNode must return the initial reservation to
-    /// QueryState::initial_reservations(), which is done automatically in Close() as long
-    /// as the initial reservation is not released before Close().
-    Status claim_buffer_reservation(RuntimeState* state);
-
     /// Release all memory of block which got from child. The block
     // 1. clear mem of valid column get from child, make sure child can reuse the mem
     // 2. delete and release the column which create by function all and other reason
@@ -315,12 +305,6 @@ protected:
     std::mutex _exec_options_lock;
     std::string _runtime_exec_options;
 
-    /// Buffer pool client for this node. Initialized with the node's minimum reservation
-    /// in ClaimBufferReservation(). After initialization, the client must hold onto at
-    /// least the minimum reservation so that it can be returned to the initial
-    /// reservations pool in Close().
-    BufferPool::ClientHandle _buffer_pool_client;
-
     // Set to true if this is a vectorized exec node.
     bool _is_vec = false;
 
diff --git a/be/src/runtime/CMakeLists.txt b/be/src/runtime/CMakeLists.txt
index b2930ffb8a..dfc375bd24 100644
--- a/be/src/runtime/CMakeLists.txt
+++ b/be/src/runtime/CMakeLists.txt
@@ -56,16 +56,9 @@ set(RUNTIME_FILES
     load_path_mgr.cpp
     types.cpp
     tmp_file_mgr.cc
-    disk_io_mgr.cc
-    disk_io_mgr_reader_context.cc
-    disk_io_mgr_scan_range.cc 
     load_channel_mgr.cpp
     load_channel.cpp
     tablets_channel.cpp
-    bufferpool/buffer_allocator.cc
-    bufferpool/buffer_pool.cc
-    bufferpool/suballocator.cc
-    bufferpool/system_allocator.cc
     snapshot_loader.cpp
     query_statistics.cpp 
     message_body_sink.cpp
diff --git a/be/src/runtime/bufferpool/buffer_allocator.cc b/be/src/runtime/bufferpool/buffer_allocator.cc
deleted file mode 100644
index 1b2ef5c95a..0000000000
--- a/be/src/runtime/bufferpool/buffer_allocator.cc
+++ /dev/null
@@ -1,744 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "runtime/bufferpool/buffer_allocator.h"
-
-#include <mutex>
-
-#include "common/config.h"
-#include "runtime/bufferpool/system_allocator.h"
-#include "runtime/thread_context.h"
-#include "util/bit_util.h"
-#include "util/cpu_info.h"
-#include "util/pretty_printer.h"
-#include "util/runtime_profile.h"
-
-//DECLARE_bool(disable_mem_pools);
-
-namespace doris {
-
-/// Decrease 'bytes_remaining' by up to 'max_decrease', down to a minimum of 0.
-/// If 'require_full_decrease' is true, only decrease if we can decrease it
-/// 'max_decrease'. Returns the amount it was decreased by.
-static int64_t DecreaseBytesRemaining(int64_t max_decrease, bool require_full_decrease,
-                                      std::atomic<int64_t>* bytes_remaining);
-
-/// An arena containing free buffers and clean pages that are associated with a
-/// particular core. All public methods are thread-safe.
-class BufferPool::FreeBufferArena : public CacheLineAligned {
-public:
-    FreeBufferArena(BufferAllocator* parent);
-
-    // Destructor should only run in backend tests.
-    ~FreeBufferArena();
-
-    /// Add a free buffer to the free lists. May free buffers to the system allocator
-    /// if the list becomes full. Caller should not hold 'lock_'
-    bool AddFreeBuffer(BufferHandle&& buffer);
-
-    /// Try to get a free buffer of 'buffer_len' bytes from this arena. Returns true and
-    /// sets 'buffer' if found or false if not found. Caller should not hold 'lock_'.
-    bool PopFreeBuffer(int64_t buffer_len, BufferHandle* buffer);
-
-    /*
-  /// Try to get a buffer of 'buffer_len' bytes from this arena by evicting a clean page.
-  /// Returns true and sets 'buffer' if a clean page was evicted or false otherwise.
-  /// Caller should not hold 'lock_'
-  bool EvictCleanPage(int64_t buffer_len, BufferHandle* buffer);
-*/
-    /// Try to free 'target_bytes' of memory from this arena back to the system allocator.
-    /// Up to 'target_bytes_to_claim' will be given back to the caller, so it can allocate
-    /// a buffer of that size from the system. Any bytes freed in excess of
-    /// 'target_bytes_to_claim' are added to 'system_bytes_remaining_'. Returns the actual
-    /// number of bytes freed and the actual number of bytes claimed.
-    ///
-    /// Caller should not hold 'lock_'. If 'arena_lock' is non-null, ownership of the
-    /// arena lock is transferred to the caller. Uses std::unique_lock instead of
-    /// boost::std::unique_lock because it is movable.
-    std::pair<int64_t, int64_t> FreeSystemMemory(int64_t target_bytes_to_free,
-                                                 int64_t target_bytes_to_claim,
-                                                 std::unique_lock<SpinLock>* arena_lock);
-
-    /// Add a clean page to the arena. Caller must hold the page's client's lock and not
-    /// hold 'lock_' or any Page::lock_.
-    void AddCleanPage(Page* page);
-
-    /// Removes the clean page from the arena if present. Returns true if removed. If
-    /// 'claim_buffer' is true, the buffer is returned with the page, otherwise it is
-    /// added to the free buffer list. Caller must hold the page's client's lock and
-    /// not hold 'lock_' or any Page::lock_.
-    bool RemoveCleanPage(bool claim_buffer, Page* page);
-
-    /// Called periodically. Shrinks free lists that are holding onto more memory than
-    /// needed.
-    void Maintenance();
-
-    /// Test helper: gets the current size of the free list for buffers of 'len' bytes
-    /// on core 'core'.
-    int GetFreeListSize(int64_t len);
-
-    /// Return the total number of free buffers in the arena. May be approximate since
-    /// it doesn't acquire the arena lock.
-    int64_t GetNumFreeBuffers();
-
-    /// Return the total bytes of free buffers in the arena. May be approximate since
-    /// it doesn't acquire the arena lock.
-    int64_t GetFreeBufferBytes();
-
-    /// Return the total number of clean pages in the arena. May be approximate since
-    /// it doesn't acquire the arena lock.
-    int64_t GetNumCleanPages();
-
-    std::string DebugString();
-
-private:
-    /// The data structures for each power-of-two size of buffers/pages.
-    /// All members are protected by FreeBufferArena::lock_ unless otherwise mentioned.
-    struct PerSizeLists {
-        PerSizeLists() : num_free_buffers(0), low_water_mark(0), num_clean_pages(0) {}
-
-        /// Helper to add a free buffer and increment the counter.
-        /// FreeBufferArena::lock_ must be held by the caller.
-        void AddFreeBuffer(BufferHandle&& buffer) {
-            DCHECK_EQ(num_free_buffers.load(std::memory_order_acquire), free_buffers.Size());
-            num_free_buffers.fetch_add(1, std::memory_order_release);
-            free_buffers.AddFreeBuffer(std::move(buffer));
-        }
-
-        /// The number of entries in 'free_buffers'. Can be read without holding a lock to
-        /// allow threads to quickly skip over empty lists when trying to find a buffer.
-        std::atomic<int64_t> num_free_buffers;
-
-        /// Buffers that are not in use that were originally allocated on the core
-        /// corresponding to this arena.
-        FreeList free_buffers;
-
-        /// The minimum size of 'free_buffers' since the last Maintenance() call.
-        int low_water_mark;
-
-        /// The number of entries in 'clean_pages'.
-        /// Can be read without holding a lock to allow threads to quickly skip over empty
-        /// lists when trying to find a buffer in a different arena.
-        std::atomic<int64_t> num_clean_pages;
-
-        /// Unpinned pages that have had their contents written to disk. These pages can be
-        /// evicted to reclaim a buffer for any client. Pages are evicted in FIFO order,
-        /// so that pages are evicted in approximately the same order that the clients wrote
-        /// them to disk. Protected by FreeBufferArena::lock_.
-        InternalList<Page> clean_pages;
-    };
-
-    /// Return the number of buffer sizes for this allocator.
-    int NumBufferSizes() const {
-        return parent_->log_max_buffer_len_ - parent_->log_min_buffer_len_ + 1;
-    }
-
-    /// Return the lists of buffers for buffers of the given length.
-    PerSizeLists* GetListsForSize(int64_t buffer_len) {
-        DCHECK(BitUtil::IsPowerOf2(buffer_len));
-        int idx = BitUtil::Log2Ceiling64(buffer_len) - parent_->log_min_buffer_len_;
-        DCHECK_LT(idx, NumBufferSizes());
-        return &buffer_sizes_[idx];
-    }
-
-    /// Compute a sum over all the lists in the arena. Does not lock the arena.
-    int64_t SumOverSizes(
-            std::function<int64_t(PerSizeLists* lists, int64_t buffer_size)> compute_fn);
-
-    BufferAllocator* const parent_;
-
-    /// Protects all data structures in the arena. See buffer-pool-internal.h for lock
-    /// order.
-    SpinLock lock_;
-
-    /// Free buffers and clean pages for each buffer size for this arena.
-    /// Indexed by log2(bytes) - log2(min_buffer_len_).
-    PerSizeLists buffer_sizes_[LOG_MAX_BUFFER_BYTES + 1];
-};
-
-int64_t BufferPool::BufferAllocator::CalcMaxBufferLen(int64_t min_buffer_len,
-                                                      int64_t system_bytes_limit) {
-    // Find largest power of 2 smaller than 'system_bytes_limit'.
-    int64_t upper_bound =
-            system_bytes_limit == 0 ? 1L : 1L << BitUtil::Log2Floor64(system_bytes_limit);
-    upper_bound = std::min(MAX_BUFFER_BYTES, upper_bound);
-    return std::max(min_buffer_len, upper_bound); // Can't be < min_buffer_len.
-}
-
-BufferPool::BufferAllocator::BufferAllocator(BufferPool* pool, int64_t min_buffer_len,
-                                             int64_t system_bytes_limit,
-                                             int64_t clean_page_bytes_limit)
-        : pool_(pool),
-          system_allocator_(new SystemAllocator(min_buffer_len)),
-          min_buffer_len_(min_buffer_len),
-          max_buffer_len_(CalcMaxBufferLen(min_buffer_len, system_bytes_limit)),
-          log_min_buffer_len_(BitUtil::Log2Ceiling64(min_buffer_len_)),
-          log_max_buffer_len_(BitUtil::Log2Ceiling64(max_buffer_len_)),
-          system_bytes_limit_(system_bytes_limit),
-          system_bytes_remaining_(system_bytes_limit),
-          clean_page_bytes_limit_(clean_page_bytes_limit),
-          clean_page_bytes_remaining_(clean_page_bytes_limit),
-          per_core_arenas_(CpuInfo::get_max_num_cores()),
-          max_scavenge_attempts_(MAX_SCAVENGE_ATTEMPTS),
-          _mem_tracker(std::make_unique<MemTracker>("BufferAllocator")) {
-    DCHECK(BitUtil::IsPowerOf2(min_buffer_len_)) << min_buffer_len_;
-    DCHECK(BitUtil::IsPowerOf2(max_buffer_len_)) << max_buffer_len_;
-    DCHECK_LE(0, min_buffer_len_);
-    DCHECK_LE(min_buffer_len_, max_buffer_len_);
-    DCHECK_LE(max_buffer_len_, MAX_BUFFER_BYTES);
-    DCHECK_LE(max_buffer_len_, std::max(system_bytes_limit_, min_buffer_len_));
-
-    for (std::unique_ptr<FreeBufferArena>& arena : per_core_arenas_) {
-        arena.reset(new FreeBufferArena(this));
-    }
-}
-
-BufferPool::BufferAllocator::~BufferAllocator() {
-    per_core_arenas_.clear(); // Release all the memory.
-    // Check for accounting leaks.
-    DCHECK_EQ(system_bytes_limit_, system_bytes_remaining_.load(std::memory_order_acquire));
-    DCHECK_EQ(clean_page_bytes_limit_, clean_page_bytes_remaining_.load(std::memory_order_acquire));
-}
-
-Status BufferPool::BufferAllocator::Allocate(ClientHandle* client, int64_t len,
-                                             BufferHandle* buffer) {
-    SCOPED_TIMER(client->impl_->counters().alloc_time);
-    COUNTER_UPDATE(client->impl_->counters().cumulative_bytes_alloced, len);
-    COUNTER_UPDATE(client->impl_->counters().cumulative_allocations, 1);
-
-    RETURN_IF_ERROR(AllocateInternal(len, buffer));
-    DCHECK(buffer->is_open());
-    buffer->client_ = client;
-    return Status::OK();
-}
-
-Status BufferPool::BufferAllocator::AllocateInternal(int64_t len, BufferHandle* buffer) {
-    DCHECK(!buffer->is_open());
-    DCHECK_GE(len, min_buffer_len_);
-    DCHECK(BitUtil::IsPowerOf2(len)) << len;
-
-    std::stringstream err_stream;
-    if (UNLIKELY(len > MAX_BUFFER_BYTES)) {
-        err_stream << "Tried to allocate buffer of " << len << " bytes"
-                   << " max of " << MAX_BUFFER_BYTES << " bytes";
-        return Status::InternalError(err_stream.str());
-    }
-    if (UNLIKELY(len > system_bytes_limit_)) {
-        err_stream << "Tried to allocate buffer of " << len << " bytes"
-                   << " > buffer pool limit of  " << system_bytes_limit_ << " bytes";
-        return Status::InternalError(err_stream.str());
-    }
-
-    const int current_core = CpuInfo::get_current_core();
-    // Fast path: recycle a buffer of the correct size from this core's arena.
-    FreeBufferArena* current_core_arena = per_core_arenas_[current_core].get();
-    if (current_core_arena->PopFreeBuffer(len, buffer)) return Status::OK();
-
-    // Fast-ish path: allocate a new buffer if there is room in 'system_bytes_remaining_'.
-    int64_t delta = DecreaseBytesRemaining(len, true, &system_bytes_remaining_);
-    if (delta != len) {
-        DCHECK_EQ(0, delta);
-        const std::vector<int>& numa_node_cores =
-                CpuInfo::get_cores_of_same_numa_node(current_core);
-        const int numa_node_core_idx = CpuInfo::get_numa_node_core_idx(current_core);
-
-        // Fast-ish path: find a buffer of the right size from another core on the same
-        // NUMA node. Avoid getting a buffer from another NUMA node - prefer reclaiming
-        // a clean page on this NUMA node or scavenging then reallocating a new buffer.
-        // We don't want to get into a state where allocations between the nodes are
-        // unbalanced and one node is stuck reusing memory allocated on the other node.
-        for (int i = 1; i < numa_node_cores.size(); ++i) {
-            // Each core should start searching from a different point to avoid hot-spots.
-            int other_core = numa_node_cores[(numa_node_core_idx + i) % numa_node_cores.size()];
-            FreeBufferArena* other_core_arena = per_core_arenas_[other_core].get();
-            if (other_core_arena->PopFreeBuffer(len, buffer)) return Status::OK();
-        }
-
-        /*
-    // Fast-ish path: evict a clean page of the right size from the current NUMA node.
-    for (int i = 0; i < numa_node_cores.size(); ++i) {
-      int other_core = numa_node_cores[(numa_node_core_idx + i) % numa_node_cores.size()];
-      FreeBufferArena* other_core_arena = per_core_arenas_[other_core].get();
-      if (other_core_arena->EvictCleanPage(len, buffer)) return Status::OK();
-    }
-*/
-        // Slow path: scavenge buffers of different sizes from free buffer lists and clean
-        // pages. Make initial, fast attempts to gather the required buffers, before
-        // finally making a slower, but guaranteed-to-succeed attempt.
-        // TODO: IMPALA-4703: add a stress option where we vary the number of attempts
-        // randomly.
-        int attempt = 0;
-        while (attempt < max_scavenge_attempts_ && delta < len) {
-            bool final_attempt = attempt == max_scavenge_attempts_ - 1;
-            delta += ScavengeBuffers(final_attempt, current_core, len - delta);
-            ++attempt;
-        }
-        if (delta < len) {
-            system_bytes_remaining_.fetch_add(delta, std::memory_order_release);
-            // This indicates an accounting bug - we should be able to always get the memory.
-            std::stringstream err_stream;
-            err_stream << "Could not allocate : " << len << "bytes: was only able to free up "
-                       << delta << " bytes after " << max_scavenge_attempts_ << " attempts:\n"
-                       << pool_->DebugString();
-            return Status::InternalError(err_stream.str());
-        }
-    }
-    // We have headroom to allocate a new buffer at this point.
-    DCHECK_EQ(delta, len);
-    Status status = system_allocator_->Allocate(len, buffer);
-    if (!status.ok()) {
-        system_bytes_remaining_.fetch_add(len, std::memory_order_release);
-        return status;
-    }
-    _mem_tracker->consume(len);
-    return Status::OK();
-}
-
-int64_t DecreaseBytesRemaining(int64_t max_decrease, bool require_full_decrease,
-                               std::atomic<int64_t>* bytes_remaining) {
-    while (true) {
-        int64_t old_value = bytes_remaining->load(std::memory_order_acquire);
-        if (require_full_decrease && old_value < max_decrease) return 0;
-        int64_t decrease = std::min(old_value, max_decrease);
-        int64_t new_value = old_value - decrease;
-        if (bytes_remaining->compare_exchange_weak(old_value, new_value,
-                                                   std::memory_order_release)) {
-            return decrease;
-        }
-    }
-}
-
-int64_t BufferPool::BufferAllocator::ScavengeBuffers(bool slow_but_sure, int current_core,
-                                                     int64_t target_bytes) {
-    // There are two strategies for scavenging buffers:
-    // 1) Fast, opportunistic: Each arena is searched in succession. Although reservations
-    //    guarantee that the memory we need is available somewhere, this may fail if we
-    //    we race with another thread that returned buffers to an arena that we've already
-    //    searched and took the buffers from an arena we haven't yet searched.
-    // 2) Slow, guaranteed to succeed: In order to ensure that we can find the memory in a
-    //    single pass, we hold locks for all arenas we've already examined. That way, other
-    //    threads can't take the memory that we need from an arena that we haven't yet
-    //    examined (or from 'system_bytes_available_') because in order to do so, it would
-    //    have had to return the equivalent amount of memory to an earlier arena or added
-    //    it back into 'systems_bytes_remaining_'. The former can't happen since we're
-    //    still holding those locks, and the latter is solved by trying to decrease
-    //    system_bytes_remaining_ with DecreaseBytesRemaining() at the end.
-    DCHECK_GT(target_bytes, 0);
-    // First make sure we've used up all the headroom in the buffer limit.
-    int64_t bytes_found = DecreaseBytesRemaining(target_bytes, false, &system_bytes_remaining_);
-    if (bytes_found == target_bytes) return bytes_found;
-
-    // In 'slow_but_sure' mode, we will hold locks for multiple arenas at the same time and
-    // therefore must start at 0 to respect the lock order. Otherwise we start with the
-    // current core's arena for locality and to avoid excessive contention on arena 0.
-    int start_core = slow_but_sure ? 0 : current_core;
-    std::vector<std::unique_lock<SpinLock>> arena_locks;
-    if (slow_but_sure) arena_locks.resize(per_core_arenas_.size());
-
-    for (int i = 0; i < per_core_arenas_.size(); ++i) {
-        int core_to_check = (start_core + i) % per_core_arenas_.size();
-        FreeBufferArena* arena = per_core_arenas_[core_to_check].get();
-        int64_t bytes_needed = target_bytes - bytes_found;
-        bytes_found += arena->FreeSystemMemory(bytes_needed, bytes_needed,
-                                               slow_but_sure ? &arena_locks[i] : nullptr)
-                               .second;
-        if (bytes_found == target_bytes) break;
-    }
-    DCHECK_LE(bytes_found, target_bytes);
-
-    // Decrement 'system_bytes_remaining_' while still holding the arena locks to avoid
-    // the window for a race with another thread that removes a buffer from a list and
-    // then increments 'system_bytes_remaining_'. The race is prevented because the other
-    // thread holds the lock while decrementing 'system_bytes_remaining_' in the cases
-    // where it may not have reservation corresponding to that memory.
-    if (slow_but_sure && bytes_found < target_bytes) {
-        bytes_found +=
-                DecreaseBytesRemaining(target_bytes - bytes_found, true, &system_bytes_remaining_);
-        // Deadlock in arena_locks in BufferPool::BufferAllocator::ScavengeBuffers and _lock in DebugString
-        // DCHECK_EQ(bytes_found, target_bytes) << DebugString();
-    }
-    return bytes_found;
-}
-
-void BufferPool::BufferAllocator::Free(BufferHandle&& handle) {
-    DCHECK(handle.is_open());
-    handle.client_ = nullptr; // Buffer is no longer associated with a client.
-    FreeBufferArena* arena = per_core_arenas_[handle.home_core_].get();
-    handle.Poison();
-    if (!arena->AddFreeBuffer(std::move(handle))) {
-        _mem_tracker->release(handle.len());
-    }
-}
-
-void BufferPool::BufferAllocator::AddCleanPage(const std::unique_lock<std::mutex>& client_lock,
-                                               Page* page) {
-    page->client->DCheckHoldsLock(client_lock);
-    FreeBufferArena* arena = per_core_arenas_[page->buffer.home_core_].get();
-    arena->AddCleanPage(page);
-}
-
-bool BufferPool::BufferAllocator::RemoveCleanPage(const std::unique_lock<std::mutex>& client_lock,
-                                                  bool claim_buffer, Page* page) {
-    page->client->DCheckHoldsLock(client_lock);
-    FreeBufferArena* arena;
-    {
-        std::lock_guard<SpinLock> pl(page->buffer_lock);
-        // Page may be evicted - in which case it has no home core and is not in an arena.
-        if (!page->buffer.is_open()) return false;
-        arena = per_core_arenas_[page->buffer.home_core_].get();
-    }
-    return arena->RemoveCleanPage(claim_buffer, page);
-}
-
-void BufferPool::BufferAllocator::Maintenance() {
-    for (std::unique_ptr<FreeBufferArena>& arena : per_core_arenas_) arena->Maintenance();
-}
-
-void BufferPool::BufferAllocator::ReleaseMemory(int64_t bytes_to_free) {
-    int64_t bytes_freed = 0;
-    int current_core = CpuInfo::get_current_core();
-    for (int i = 0; i < per_core_arenas_.size(); ++i) {
-        int core_to_check = (current_core + i) % per_core_arenas_.size();
-        FreeBufferArena* arena = per_core_arenas_[core_to_check].get();
-        // Free but don't claim any memory.
-        bytes_freed += arena->FreeSystemMemory(bytes_to_free - bytes_freed, 0, nullptr).first;
-        if (bytes_freed >= bytes_to_free) return;
-    }
-}
-
-int BufferPool::BufferAllocator::GetFreeListSize(int core, int64_t len) {
-    return per_core_arenas_[core]->GetFreeListSize(len);
-}
-
-int64_t BufferPool::BufferAllocator::FreeToSystem(std::vector<BufferHandle>&& buffers) {
-    int64_t bytes_freed = 0;
-    for (BufferHandle& buffer : buffers) {
-        bytes_freed += buffer.len();
-        // Ensure that the memory is unpoisoned when it's next allocated by the system.
-        buffer.Unpoison();
-        system_allocator_->Free(std::move(buffer));
-    }
-    _mem_tracker->release(bytes_freed);
-    return bytes_freed;
-}
-
-int64_t BufferPool::BufferAllocator::SumOverArenas(
-        std::function<int64_t(FreeBufferArena* arena)> compute_fn) const {
-    int64_t total = 0;
-    for (const std::unique_ptr<FreeBufferArena>& arena : per_core_arenas_) {
-        total += compute_fn(arena.get());
-    }
-    return total;
-}
-
-int64_t BufferPool::BufferAllocator::GetNumFreeBuffers() const {
-    return SumOverArenas([](FreeBufferArena* arena) { return arena->GetNumFreeBuffers(); });
-}
-
-int64_t BufferPool::BufferAllocator::GetFreeBufferBytes() const {
-    return SumOverArenas([](FreeBufferArena* arena) { return arena->GetFreeBufferBytes(); });
-}
-
-int64_t BufferPool::BufferAllocator::GetNumCleanPages() const {
-    return SumOverArenas([](FreeBufferArena* arena) { return arena->GetNumCleanPages(); });
-}
-
-int64_t BufferPool::BufferAllocator::GetCleanPageBytesLimit() const {
-    return clean_page_bytes_limit_;
-}
-
-int64_t BufferPool::BufferAllocator::GetCleanPageBytes() const {
-    return clean_page_bytes_limit_ - clean_page_bytes_remaining_.load(std::memory_order_acquire);
-}
-
-std::string BufferPool::BufferAllocator::DebugString() {
-    std::stringstream ss;
-    ss << "<BufferAllocator> " << this << " min_buffer_len: " << min_buffer_len_
-       << " system_bytes_limit: " << system_bytes_limit_
-       << " system_bytes_remaining: " << system_bytes_remaining_.load(std::memory_order_acquire)
-       << "\n"
-       << " clean_page_bytes_limit: " << clean_page_bytes_limit_ << " clean_page_bytes_remaining: "
-       << clean_page_bytes_remaining_.load(std::memory_order_acquire) << "\n";
-    for (int i = 0; i < per_core_arenas_.size(); ++i) {
-        ss << "  Arena " << i << " " << per_core_arenas_[i]->DebugString() << "\n";
-    }
-    return ss.str();
-}
-
-BufferPool::FreeBufferArena::FreeBufferArena(BufferAllocator* parent) : parent_(parent) {}
-
-BufferPool::FreeBufferArena::~FreeBufferArena() {
-    for (int i = 0; i < NumBufferSizes(); ++i) {
-        // Clear out the free lists.
-        FreeList* list = &buffer_sizes_[i].free_buffers;
-        std::vector<BufferHandle> buffers = list->GetBuffersToFree(list->Size());
-        parent_->system_bytes_remaining_.fetch_add(parent_->FreeToSystem(std::move(buffers)),
-                                                   std::memory_order_release);
-
-        // All pages should have been destroyed.
-        DCHECK_EQ(0, buffer_sizes_[i].clean_pages.size());
-    }
-}
-
-bool BufferPool::FreeBufferArena::AddFreeBuffer(BufferHandle&& buffer) {
-    std::lock_guard<SpinLock> al(lock_);
-    if (config::disable_mem_pools) {
-        int64_t len = buffer.len();
-        parent_->system_allocator_->Free(std::move(buffer));
-        parent_->system_bytes_remaining_.fetch_add(len, std::memory_order_release);
-        return false;
-    }
-    PerSizeLists* lists = GetListsForSize(buffer.len());
-    lists->AddFreeBuffer(std::move(buffer));
-    return true;
-}
-
-bool BufferPool::FreeBufferArena::RemoveCleanPage(bool claim_buffer, Page* page) {
-    std::lock_guard<SpinLock> al(lock_);
-    PerSizeLists* lists = GetListsForSize(page->len);
-    DCHECK_EQ(lists->num_clean_pages.load(std::memory_order_acquire), lists->clean_pages.size());
-    if (!lists->clean_pages.remove(page)) return false;
-    lists->num_clean_pages.fetch_sub(1, std::memory_order_release);
-    parent_->clean_page_bytes_remaining_.fetch_add(page->len, std::memory_order_release);
-    if (!claim_buffer) {
-        BufferHandle buffer;
-        {
-            std::lock_guard<SpinLock> pl(page->buffer_lock);
-            buffer = std::move(page->buffer);
-        }
-        lists->AddFreeBuffer(std::move(buffer));
-    }
-    return true;
-}
-
-bool BufferPool::FreeBufferArena::PopFreeBuffer(int64_t buffer_len, BufferHandle* buffer) {
-    PerSizeLists* lists = GetListsForSize(buffer_len);
-    // Check before acquiring lock.
-    if (lists->num_free_buffers.load(std::memory_order_acquire) == 0) return false;
-
-    std::lock_guard<SpinLock> al(lock_);
-    FreeList* list = &lists->free_buffers;
-    DCHECK_EQ(lists->num_free_buffers.load(std::memory_order_acquire), list->Size());
-    if (!list->PopFreeBuffer(buffer)) return false;
-    buffer->Unpoison();
-    lists->num_free_buffers.fetch_sub(1, std::memory_order_release);
-    lists->low_water_mark = std::min<int>(lists->low_water_mark, list->Size());
-    return true;
-}
-/*
-bool BufferPool::FreeBufferArena::EvictCleanPage(
-    int64_t buffer_len, BufferHandle* buffer) {
-  PerSizeLists* lists = GetListsForSize(buffer_len);
-  // Check before acquiring lock.
-  if (lists->num_clean_pages.Load() == 0) return false;
-
-  std::lock_guard<SpinLock> al(lock_);
-  DCHECK_EQ(lists->num_clean_pages.Load(), lists->clean_pages.size());
-  Page* page = lists->clean_pages.dequeue();
-  if (page == nullptr) return false;
-  lists->num_clean_pages.Add(-1);
-  parent_->clean_page_bytes_remaining_.Add(buffer_len);
-  std::lock_guard<SpinLock> pl(page->buffer_lock);
-  *buffer = std::move(page->buffer);
-  return true;
-}
-*/
-std::pair<int64_t, int64_t> BufferPool::FreeBufferArena::FreeSystemMemory(
-        int64_t target_bytes_to_free, int64_t target_bytes_to_claim,
-        std::unique_lock<SpinLock>* arena_lock) {
-    DCHECK_GT(target_bytes_to_free, 0);
-    DCHECK_GE(target_bytes_to_free, target_bytes_to_claim);
-    int64_t bytes_freed = 0;
-    // If the caller is acquiring the lock, just lock for the whole method.
-    // Otherwise lazily acquire the lock the first time we find some memory
-    // to free.
-    std::unique_lock<SpinLock> al(lock_, std::defer_lock_t());
-    if (arena_lock != nullptr) al.lock();
-
-    std::vector<BufferHandle> buffers;
-    // Search from largest to smallest to avoid freeing many small buffers unless
-    // necessary.
-    for (int i = NumBufferSizes() - 1; i >= 0; --i) {
-        PerSizeLists* lists = &buffer_sizes_[i];
-        // Check before acquiring lock to avoid expensive lock acquisition and make scanning
-        // empty lists much cheaper.
-        if (lists->num_free_buffers.load(std::memory_order_acquire) == 0 &&
-            lists->num_clean_pages.load(std::memory_order_acquire) == 0) {
-            continue;
-        }
-        if (!al.owns_lock()) al.lock();
-        FreeList* free_buffers = &lists->free_buffers;
-        InternalList<Page>* clean_pages = &lists->clean_pages;
-        DCHECK_EQ(lists->num_free_buffers.load(std::memory_order_acquire), free_buffers->Size());
-        DCHECK_EQ(lists->num_clean_pages.load(std::memory_order_acquire), clean_pages->size());
-
-        // Figure out how many of the buffers in the free list we should free.
-        DCHECK_GT(target_bytes_to_free, bytes_freed);
-        const int64_t buffer_len = 1L << (i + parent_->log_min_buffer_len_);
-        int64_t buffers_to_free =
-                std::min(free_buffers->Size(),
-                         BitUtil::Ceil(target_bytes_to_free - bytes_freed, buffer_len));
-        int64_t buffer_bytes_to_free = buffers_to_free * buffer_len;
-
-        // Evict clean pages by moving their buffers to the free page list before freeing
-        // them. This ensures that they are freed based on memory address in the expected
-        // order.
-        int num_pages_evicted = 0;
-        int64_t page_bytes_evicted = 0;
-        while (bytes_freed + buffer_bytes_to_free < target_bytes_to_free) {
-            Page* page = clean_pages->dequeue();
-            if (page == nullptr) break;
-            BufferHandle page_buffer;
-            {
-                std::lock_guard<SpinLock> pl(page->buffer_lock);
-                page_buffer = std::move(page->buffer);
-            }
-            ++buffers_to_free;
-            buffer_bytes_to_free += page_buffer.len();
-            ++num_pages_evicted;
-            page_bytes_evicted += page_buffer.len();
-            free_buffers->AddFreeBuffer(std::move(page_buffer));
-        }
-        lists->num_free_buffers.fetch_add(num_pages_evicted, std::memory_order_release);
-        lists->num_clean_pages.fetch_sub(num_pages_evicted, std::memory_order_release);
-        parent_->clean_page_bytes_remaining_.fetch_add(page_bytes_evicted,
-                                                       std::memory_order_release);
-
-        if (buffers_to_free > 0) {
-            int64_t buffer_bytes_freed =
-                    parent_->FreeToSystem(free_buffers->GetBuffersToFree(buffers_to_free));
-            DCHECK_EQ(buffer_bytes_to_free, buffer_bytes_freed);
-            bytes_freed += buffer_bytes_to_free;
-            lists->num_free_buffers.fetch_sub(buffers_to_free, std::memory_order_release);
-            lists->low_water_mark = std::min<int>(lists->low_water_mark, free_buffers->Size());
-            if (bytes_freed >= target_bytes_to_free) break;
-        }
-        // Should have cleared out all lists if we don't have enough memory at this point.
-        DCHECK_EQ(0, free_buffers->Size());
-        DCHECK_EQ(0, clean_pages->size());
-    }
-    int64_t bytes_claimed = std::min(bytes_freed, target_bytes_to_claim);
-    if (bytes_freed > bytes_claimed) {
-        // Add back the extra for other threads before releasing the lock to avoid race
-        // where the other thread may not be able to find enough buffers.
-        parent_->system_bytes_remaining_.fetch_add((bytes_freed - bytes_claimed),
-                                                   std::memory_order_release);
-    }
-    if (arena_lock != nullptr) *arena_lock = std::move(al);
-    return std::make_pair(bytes_freed, bytes_claimed);
-}
-
-void BufferPool::FreeBufferArena::AddCleanPage(Page* page) {
-    bool eviction_needed =
-            config::disable_mem_pools ||
-            DecreaseBytesRemaining(page->len, true, &parent_->clean_page_bytes_remaining_) == 0;
-    std::lock_guard<SpinLock> al(lock_);
-    PerSizeLists* lists = GetListsForSize(page->len);
-    DCHECK_EQ(lists->num_clean_pages.load(std::memory_order_acquire), lists->clean_pages.size());
-    if (eviction_needed) {
-        if (lists->clean_pages.empty()) {
-            // No other pages to evict, must evict 'page' instead of adding it.
-            lists->AddFreeBuffer(std::move(page->buffer));
-        } else {
-            // Evict an older page (FIFO eviction) to make space for this one.
-            Page* page_to_evict = lists->clean_pages.dequeue();
-            lists->clean_pages.enqueue(page);
-            BufferHandle page_to_evict_buffer;
-            {
-                std::lock_guard<SpinLock> pl(page_to_evict->buffer_lock);
-                page_to_evict_buffer = std::move(page_to_evict->buffer);
-            }
-            lists->AddFreeBuffer(std::move(page_to_evict_buffer));
-        }
-    } else {
-        lists->clean_pages.enqueue(page);
-        lists->num_clean_pages.fetch_add(1, std::memory_order_release);
-    }
-}
-
-void BufferPool::FreeBufferArena::Maintenance() {
-    std::lock_guard<SpinLock> al(lock_);
-    for (int i = 0; i < NumBufferSizes(); ++i) {
-        PerSizeLists* lists = &buffer_sizes_[i];
-        DCHECK_LE(lists->low_water_mark, lists->free_buffers.Size());
-        if (lists->low_water_mark != 0) {
-            // We haven't needed the buffers below the low water mark since the previous
-            // Maintenance() call. Discard half of them to free up memory. By always discarding
-            // at least one, we guarantee that an idle list will shrink to zero entries.
-            int num_to_free = std::max(1, lists->low_water_mark / 2);
-            parent_->system_bytes_remaining_.fetch_add(
-                    parent_->FreeToSystem(lists->free_buffers.GetBuffersToFree(num_to_free)),
-                    std::memory_order_release);
-            lists->num_free_buffers.fetch_sub(num_to_free, std::memory_order_release);
-        }
-        lists->low_water_mark = lists->free_buffers.Size();
-    }
-}
-
-int BufferPool::FreeBufferArena::GetFreeListSize(int64_t len) {
-    std::lock_guard<SpinLock> al(lock_);
-    PerSizeLists* lists = GetListsForSize(len);
-    DCHECK_EQ(lists->num_free_buffers.load(std::memory_order_acquire), lists->free_buffers.Size());
-    return lists->free_buffers.Size();
-}
-
-int64_t BufferPool::FreeBufferArena::SumOverSizes(
-        std::function<int64_t(PerSizeLists* lists, int64_t buffer_size)> compute_fn) {
-    int64_t total = 0;
-    for (int i = 0; i < NumBufferSizes(); ++i) {
-        int64_t buffer_size = (1L << i) * parent_->min_buffer_len_;
-        total += compute_fn(&buffer_sizes_[i], buffer_size);
-    }
-    return total;
-}
-
-int64_t BufferPool::FreeBufferArena::GetNumFreeBuffers() {
-    return SumOverSizes([](PerSizeLists* lists, int64_t buffer_size) {
-        return lists->num_free_buffers.load(std::memory_order_acquire);
-    });
-}
-
-int64_t BufferPool::FreeBufferArena::GetFreeBufferBytes() {
-    return SumOverSizes([](PerSizeLists* lists, int64_t buffer_size) {
-        return lists->num_free_buffers.load(std::memory_order_acquire) * buffer_size;
-    });
-}
-
-int64_t BufferPool::FreeBufferArena::GetNumCleanPages() {
-    return SumOverSizes([](PerSizeLists* lists, int64_t buffer_size) {
-        return lists->num_clean_pages.load(std::memory_order_acquire);
-    });
-}
-
-std::string BufferPool::FreeBufferArena::DebugString() {
-    std::lock_guard<SpinLock> al(lock_);
-    std::stringstream ss;
-    ss << "<FreeBufferArena> " << this << "\n";
-    for (int i = 0; i < NumBufferSizes(); ++i) {
-        int64_t buffer_len = 1L << (parent_->log_min_buffer_len_ + i);
-        PerSizeLists& lists = buffer_sizes_[i];
-        ss << "  " << PrettyPrinter::print_bytes(buffer_len) << ":"
-           << " free buffers: " << lists.num_free_buffers.load(std::memory_order_acquire)
-           << " low water mark: " << lists.low_water_mark
-           << " clean pages: " << lists.num_clean_pages.load(std::memory_order_acquire) << " ";
-        lists.clean_pages.iterate(
-                std::bind<bool>(Page::DebugStringCallback, &ss, std::placeholders::_1));
-        ss << "\n";
-    }
-    return ss.str();
-}
-} // namespace doris
diff --git a/be/src/runtime/bufferpool/buffer_allocator.h b/be/src/runtime/bufferpool/buffer_allocator.h
deleted file mode 100644
index cf2a0f741e..0000000000
--- a/be/src/runtime/bufferpool/buffer_allocator.h
+++ /dev/null
@@ -1,241 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include "runtime/bufferpool/buffer_pool_internal.h"
-#include "runtime/bufferpool/free_list.h"
-#include "runtime/memory/mem_tracker.h"
-#include "util/aligned_new.h"
-
-namespace doris {
-
-/// The internal buffer allocator used by BufferPool to allocator power-of-two sized
-/// buffers. BufferAllocator builds on top of SystemAllocator by adding caching of
-/// free buffers and clean pages where the memory is not currently in use by a client
-/// but has not yet been released to SystemAllocator.
-///
-/// The allocator is optimised for the common case where an allocation can be served
-/// by reclaiming a buffer of the request size from the current core's arena. In this
-/// case there is no contention for locks between concurrently-running threads. If this
-/// fails, progressively more expensive approaches to allocate memory are tried until
-/// the allocation eventually success (see AllocateInternal() for details).
-///
-/// Buffer Reservations
-/// ===================
-/// The implementation of the BufferAllocator relies on the BufferPool's reservation
-/// tracking system. The allocator is given a hard limit ('system_bytes_limit'), above
-/// which all allocations will fail. Allocations up to 'system_bytes_limit' are
-/// guaranteed to succeed unless an unexpected system error occurs (e.g. we can't allocate
-/// all of the required memory from the OS). Reservations must be set up so that the total
-/// of all reservations does not exceed 'system_bytes_limit', thus ensuring that
-/// BufferAllocator can always find memory to fulfill reservations.
-///
-/// +========================+
-/// | IMPLEMENTATION NOTES   |
-/// +========================+
-///
-/// Memory
-/// ======
-/// Memory managed by BufferAllocator comes in four forms:
-/// 1. Buffers returned to the client (corresponding to a used reservation)
-/// 2. Free buffers cached in the BufferAllocator's free lists.
-/// 3. Buffers attached to clean unpinned pages in the BufferAllocator's clean page lists.
-/// 4. Bytes that are not allocated from the system: 'system_bytes_remaining_'.
-/// Together these always add up to 'system_bytes_limit', which allows BufferAllocator
-/// to always fulfill reservations via some combination of memory in forms 2, 3 or 4.
-///
-/// The BufferAllocator code is careful not to make memory inaccessible to concurrently
-/// executing threads that are entitled to it. E.g. if one thread is entitled to allocate
-/// a 1MB buffer from the BufferAllocator's free or clean page lists but needs to release
-/// a 2MB buffer to the system to free up enough memory, it must add 1MB to
-/// 'system_bytes_remaining_' in the same critical section in which it freed the 2MB
-/// buffer. Otherwise a concurrent thread that had a reservation for 1MB of memory might
-/// not be able to find it.
-///
-/// Arenas
-/// ======
-/// The buffer allocator's data structures are broken up into arenas, with an arena per
-/// core. Within each arena, each buffer or page is stored in a list with buffers and
-/// pages of the same size: there is a separate list for every power-of-two size. Each
-/// arena is protected by a separate lock, so in the common case where threads are able
-/// to fulfill allocations from their own arena, there will be no lock contention.
-///
-struct BufferPool::BufferAllocator {
-    BufferAllocator(BufferPool* pool, int64_t min_buffer_len, int64_t system_bytes_limit,
-                    int64_t clean_page_bytes_limit);
-    ~BufferAllocator();
-
-    /// Allocate a buffer with a power-of-two length 'len'. This function may acquire
-    /// 'FreeBufferArena::lock_' and Page::lock so no locks lower in the lock acquisition
-    /// order (see buffer-pool-internal.h) should be held by the caller.
-    ///
-    /// Always succeeds on allocating memory up to 'system_bytes_limit', unless the system
-    /// is unable to give us 'system_bytes_limit' of memory or an internal bug: if all
-    /// clients write out enough dirty pages to stay within their reservation, then there
-    /// should always be enough free buffers and clean pages to reclaim.
-    Status Allocate(ClientHandle* client, int64_t len,
-                    BufferPool::BufferHandle* buffer) WARN_UNUSED_RESULT;
-
-    /// Frees 'buffer', which must be open before calling. Closes 'buffer' and updates
-    /// internal state but does not release to any reservation.
-    void Free(BufferPool::BufferHandle&& buffer);
-
-    /// Adds a clean page 'page' to a clean page list. Caller must hold the page's
-    /// client's lock via 'client_lock' so that moving the page between the client list and
-    /// the free page list is atomic. Caller must not hold 'FreeBufferArena::lock_' or any
-    /// Page::lock.
-    void AddCleanPage(const std::unique_lock<std::mutex>& client_lock, Page* page);
-
-    /// Removes a clean page 'page' from a clean page list and returns true, if present in
-    /// one of the lists. Returns true if it was present. If 'claim_buffer' is true, the
-    /// caller must have reservation for the buffer, which is returned along with the page.
-    /// Otherwise the buffer is moved directly to the free buffer list. Caller must hold
-    /// the page's client's lock via 'client_lock' so that moving the page between the
-    /// client list and the free page list is atomic. Caller must not hold
-    /// 'FreeBufferArena::lock_' or any Page::lock.
-    bool RemoveCleanPage(const std::unique_lock<std::mutex>& client_lock, bool claim_buffer,
-                         Page* page);
-
-    /// Periodically called to release free buffers back to the SystemAllocator. Releases
-    /// buffers based on recent allocation patterns, trying to minimise the number of
-    /// excess buffers retained in each list above the minimum required to avoid going
-    /// to the system allocator.
-    void Maintenance();
-
-    /// Try to release at least 'bytes_to_free' bytes of memory to the system allocator.
-    void ReleaseMemory(int64_t bytes_to_free);
-
-    int64_t system_bytes_limit() const { return system_bytes_limit_; }
-
-    /// Return the amount of memory currently allocated from the system.
-    int64_t GetSystemBytesAllocated() const {
-        return system_bytes_limit_ - system_bytes_remaining_.load();
-    }
-
-    /// Return the total number of free buffers in the allocator.
-    int64_t GetNumFreeBuffers() const;
-
-    /// Return the total bytes of free buffers in the allocator.
-    int64_t GetFreeBufferBytes() const;
-
-    /// Return the limit on bytes of clean pages in the allocator.
-    int64_t GetCleanPageBytesLimit() const;
-
-    /// Return the total number of clean pages in the allocator.
-    int64_t GetNumCleanPages() const;
-
-    /// Return the total bytes of clean pages in the allocator.
-    int64_t GetCleanPageBytes() const;
-
-    std::string DebugString();
-
-protected:
-    friend class BufferAllocatorTest;
-    friend class BufferPoolTest;
-    friend class FreeBufferArena;
-
-    /// Test helper: gets the current size of the free list for buffers of 'len' bytes
-    /// on core 'core'.
-    int GetFreeListSize(int core, int64_t len);
-
-    /// Test helper: reduce the number of scavenge attempts so backend tests can force
-    /// use of the "locked" scavenging code path.
-    void set_max_scavenge_attempts(int val) {
-        DCHECK_GE(val, 1);
-        max_scavenge_attempts_ = val;
-    }
-
-private:
-    /// Compute the maximum power-of-two buffer length that could be allocated based on the
-    /// amount of memory available 'system_bytes_limit'. The value is always at least
-    /// 'min_buffer_len' so that there is at least one valid buffer size.
-    static int64_t CalcMaxBufferLen(int64_t min_buffer_len, int64_t system_bytes_limit);
-
-    /// Same as Allocate() but leaves 'buffer->client_' nullptr and does not update counters.
-    Status AllocateInternal(int64_t len, BufferPool::BufferHandle* buffer) WARN_UNUSED_RESULT;
-
-    /// Tries to reclaim enough memory from various sources so that the caller can allocate
-    /// a buffer of 'target_bytes' from the system allocator. Scavenges buffers from the
-    /// free buffer and clean page lists of all cores and frees them with
-    /// 'system_allocator_'. Also tries to decrement 'system_bytes_remaining_'.
-    /// 'current_core' is the index of the current CPU core. Any bytes freed in excess of
-    /// 'target_bytes' are added to 'system_bytes_remaining_.' If 'slow_but_sure' is true,
-    /// this function uses a slower strategy that guarantees enough memory will be found
-    /// but can block progress of other threads for longer. If 'slow_but_sure' is false,
-    /// then this function optimistically tries to reclaim the memory but may not reclaim
-    /// 'target_bytes' of memory. Returns the number of bytes reclaimed.
-    int64_t ScavengeBuffers(bool slow_but_sure, int current_core, int64_t target_bytes);
-
-    /// Helper to free a list of buffers to the system. Returns the number of bytes freed.
-    int64_t FreeToSystem(std::vector<BufferHandle>&& buffers);
-
-    /// Compute a sum over all arenas. Does not lock the arenas.
-    int64_t SumOverArenas(std::function<int64_t(FreeBufferArena* arena)> compute_fn) const;
-
-    /// The pool that this allocator is associated with.
-    BufferPool* const pool_;
-
-    /// System allocator that is ultimately used to allocate and free buffers.
-    const std::unique_ptr<SystemAllocator> system_allocator_;
-
-    /// The minimum power-of-two buffer length that can be allocated.
-    const int64_t min_buffer_len_;
-
-    /// The maximum power-of-two buffer length that can be allocated. Always >=
-    /// 'min_buffer_len' so that there is at least one valid buffer size.
-    const int64_t max_buffer_len_;
-
-    /// The log2 of 'min_buffer_len_'.
-    const int log_min_buffer_len_;
-
-    /// The log2 of 'max_buffer_len_'.
-    const int log_max_buffer_len_;
-
-    /// The maximum physical memory in bytes that will be allocated from the system.
-    const int64_t system_bytes_limit_;
-
-    /// The remaining number of bytes of 'system_bytes_limit_' that can be used for
-    /// allocating new buffers. Must be updated atomically before a new buffer is
-    /// allocated or after an existing buffer is freed with the system allocator.
-    std::atomic<int64_t> system_bytes_remaining_;
-
-    /// The maximum bytes of clean pages that can accumulate across all arenas before
-    /// they will be evicted.
-    const int64_t clean_page_bytes_limit_;
-
-    /// The number of bytes of 'clean_page_bytes_limit_' not used by clean pages. I.e.
-    /// (clean_page_bytes_limit - bytes of clean pages in the BufferAllocator).
-    /// 'clean_pages_bytes_limit_' is enforced by increasing this value before a
-    /// clean page is added and decreasing it after a clean page is reclaimed or evicted.
-    std::atomic<int64_t> clean_page_bytes_remaining_;
-
-    /// Free and clean pages. One arena per core.
-    std::vector<std::unique_ptr<FreeBufferArena>> per_core_arenas_;
-
-    /// Default number of times to attempt scavenging.
-    static const int MAX_SCAVENGE_ATTEMPTS = 3;
-
-    /// Number of times to attempt scavenging. Usually MAX_SCAVENGE_ATTEMPTS but can be
-    /// overridden by tests. The first max_scavenge_attempts_ - 1 attempts do not lock
-    /// all arenas so may fail. The final attempt locks all arenas, which is expensive
-    /// but is guaranteed to succeed.
-    int max_scavenge_attempts_;
-
-    std::unique_ptr<MemTracker> _mem_tracker;
-};
-} // namespace doris
diff --git a/be/src/runtime/bufferpool/buffer_pool.cc b/be/src/runtime/bufferpool/buffer_pool.cc
deleted file mode 100644
index 9d11c0f58d..0000000000
--- a/be/src/runtime/bufferpool/buffer_pool.cc
+++ /dev/null
@@ -1,667 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <limits>
-#include <sstream>
-
-#include "gutil/strings/substitute.h"
-#include "runtime/bufferpool/buffer_allocator.h"
-#include "runtime/bufferpool/buffer_pool_internal.h"
-#include "util/bit_util.h"
-#include "util/cpu_info.h"
-#include "util/runtime_profile.h"
-#include "util/time.h"
-#include "util/uid_util.h"
-
-//DEFINE_int32(concurrent_scratch_ios_per_device, 2,
-//    "Set this to influence the number of concurrent write I/Os issues to write data to "
-//    "scratch files. This is multiplied by the number of active scratch directories to "
-//    "obtain the target number of scratch write I/Os per query.");
-
-namespace doris {
-
-constexpr int BufferPool::LOG_MAX_BUFFER_BYTES;
-constexpr int64_t BufferPool::MAX_BUFFER_BYTES;
-
-void BufferPool::BufferHandle::Open(uint8_t* data, int64_t len, int home_core) {
-    DCHECK_LE(0, home_core);
-    DCHECK_LT(home_core, CpuInfo::get_max_num_cores());
-    client_ = nullptr;
-    data_ = data;
-    len_ = len;
-    home_core_ = home_core;
-}
-
-BufferPool::PageHandle::PageHandle() {
-    Reset();
-}
-
-BufferPool::PageHandle::PageHandle(PageHandle&& src) {
-    Reset();
-    *this = std::move(src);
-}
-
-BufferPool::PageHandle& BufferPool::PageHandle::operator=(PageHandle&& src) {
-    DCHECK(!is_open());
-    // Copy over all members then close src.
-    page_ = src.page_;
-    client_ = src.client_;
-    src.Reset();
-    return *this;
-}
-
-void BufferPool::PageHandle::Open(Page* page, ClientHandle* client) {
-    DCHECK(!is_open());
-    page_ = page;
-    client_ = client;
-}
-
-void BufferPool::PageHandle::Reset() {
-    page_ = nullptr;
-    client_ = nullptr;
-}
-
-int BufferPool::PageHandle::pin_count() const {
-    DCHECK(is_open());
-    // The pin count can only be modified via this PageHandle, which must not be
-    // concurrently accessed by multiple threads, so it is safe to access without locking
-    return page_->pin_count;
-}
-
-int64_t BufferPool::PageHandle::len() const {
-    DCHECK(is_open());
-    return page_->len; // Does not require locking.
-}
-
-Status BufferPool::PageHandle::GetBuffer(const BufferHandle** buffer) const {
-    DCHECK(is_open());
-    DCHECK(client_->is_registered());
-    DCHECK(is_pinned());
-    /*
-  if (page_->pin_in_flight) {
-    // Finish the work started in Pin().
-    RETURN_IF_ERROR(client_->impl_->FinishMoveEvictedToPinned(page_));
-  }
-*/
-    DCHECK(!page_->pin_in_flight);
-    *buffer = &page_->buffer;
-    DCHECK((*buffer)->is_open());
-    return Status::OK();
-}
-
-BufferPool::BufferPool(int64_t min_buffer_len, int64_t buffer_bytes_limit,
-                       int64_t clean_page_bytes_limit)
-        : allocator_(new BufferAllocator(this, min_buffer_len, buffer_bytes_limit,
-                                         clean_page_bytes_limit)),
-          min_buffer_len_(min_buffer_len) {
-    CHECK_GT(min_buffer_len, 0);
-    CHECK_EQ(min_buffer_len, BitUtil::RoundUpToPowerOfTwo(min_buffer_len));
-}
-
-BufferPool::~BufferPool() {}
-
-Status BufferPool::RegisterClient(const string& name, RuntimeProfile* profile,
-                                  ClientHandle* client) {
-    DCHECK(!client->is_registered());
-    client->impl_ = new Client(this, //file_group,
-                               name, profile);
-    return Status::OK();
-}
-
-void BufferPool::DeregisterClient(ClientHandle* client) {
-    if (!client->is_registered()) return;
-    client->impl_->Close(); // Will DCHECK if any remaining buffers or pinned pages.
-    delete client->impl_;   // Will DCHECK if there are any remaining pages.
-    client->impl_ = nullptr;
-}
-
-Status BufferPool::CreatePage(ClientHandle* client, int64_t len, PageHandle* handle,
-                              const BufferHandle** buffer) {
-    DCHECK(!handle->is_open());
-    DCHECK_GE(len, min_buffer_len_);
-    DCHECK_EQ(len, BitUtil::RoundUpToPowerOfTwo(len));
-
-    BufferHandle new_buffer;
-    // No changes have been made to state yet, so we can cleanly return on error.
-    RETURN_IF_ERROR(AllocateBuffer(client, len, &new_buffer));
-    Page* page = client->impl_->CreatePinnedPage(std::move(new_buffer));
-    handle->Open(page, client);
-    if (buffer != nullptr) *buffer = &page->buffer;
-    return Status::OK();
-}
-
-void BufferPool::DestroyPage(ClientHandle* client, PageHandle* handle) {
-    if (!handle->is_open()) return; // DestroyPage() should be idempotent.
-
-    if (handle->is_pinned()) {
-        // Cancel the read I/O - we don't need the data any more.
-        //if (handle->page_->pin_in_flight) {
-        //  handle->page_->write_handle->CancelRead();
-        //  handle->page_->pin_in_flight = false;
-        //}
-        // In the pinned case, delegate to ExtractBuffer() and FreeBuffer() to do the work
-        // of cleaning up the page, freeing the buffer and updating reservations correctly.
-        BufferHandle buffer;
-        Status status = ExtractBuffer(client, handle, &buffer);
-        DCHECK(status.ok()) << status;
-        FreeBuffer(client, &buffer);
-    } else {
-        // In the unpinned case, no reservations are used so we just clean up the page.
-        client->impl_->DestroyPageInternal(handle);
-    }
-}
-
-Status BufferPool::Pin(ClientHandle* client, PageHandle* handle) {
-    DCHECK(client->is_registered());
-    DCHECK(handle->is_open());
-    DCHECK_EQ(handle->client_, client);
-
-    Page* page = handle->page_;
-    if (page->pin_count == 0) {
-        RETURN_IF_ERROR(client->impl_->StartMoveToPinned(client, page));
-        COUNTER_UPDATE(client->impl_->counters().peak_unpinned_bytes, -page->len);
-    }
-    // Update accounting last to avoid complicating the error return path above.
-    ++page->pin_count;
-    return Status::OK();
-}
-
-void BufferPool::Unpin(ClientHandle* client, PageHandle* handle) {
-    DCHECK(handle->is_open());
-    DCHECK(client->is_registered());
-    DCHECK_EQ(handle->client_, client);
-    // If handle is pinned, we can assume that the page itself is pinned.
-    DCHECK(handle->is_pinned());
-    Page* page = handle->page_;
-
-    if (--page->pin_count > 0) return;
-    //if (page->pin_in_flight) {
-    // Data is not in memory - move it back to evicted.
-    //  client->impl_->UndoMoveEvictedToPinned(page);
-    //} else {
-    // Data is in memory - move it to dirty unpinned.
-    client->impl_->MoveToDirtyUnpinned(page);
-    //}
-    COUNTER_UPDATE(client->impl_->counters().peak_unpinned_bytes, handle->len());
-}
-
-Status BufferPool::ExtractBuffer(ClientHandle* client, PageHandle* page_handle,
-                                 BufferHandle* buffer_handle) {
-    DCHECK(page_handle->is_pinned());
-    DCHECK(!buffer_handle->is_open());
-    DCHECK_EQ(page_handle->client_, client);
-
-    // If an async pin is in flight, we need to wait for it.
-    const BufferHandle* dummy;
-    RETURN_IF_ERROR(page_handle->GetBuffer(&dummy));
-
-    // Bring the pin count to 1 so that we're not using surplus reservations.
-    while (page_handle->pin_count() > 1) Unpin(client, page_handle);
-
-    // Destroy the page and extract the buffer.
-    client->impl_->DestroyPageInternal(page_handle, buffer_handle);
-    DCHECK(buffer_handle->is_open());
-    return Status::OK();
-}
-
-Status BufferPool::AllocateBuffer(ClientHandle* client, int64_t len, BufferHandle* handle) {
-    RETURN_IF_ERROR(client->impl_->PrepareToAllocateBuffer(len));
-    Status status = allocator_->Allocate(client, len, handle);
-    if (!status.ok()) {
-        // Allocation failed - update client's accounting to reflect the failure.
-        client->impl_->FreedBuffer(len);
-    }
-    return status;
-}
-
-void BufferPool::FreeBuffer(ClientHandle* client, BufferHandle* handle) {
-    if (!handle->is_open()) return; // Should be idempotent.
-    DCHECK_EQ(client, handle->client_);
-    int64_t len = handle->len_;
-    allocator_->Free(std::move(*handle));
-    client->impl_->FreedBuffer(len);
-}
-
-Status BufferPool::TransferBuffer(ClientHandle* src_client, BufferHandle* src,
-                                  ClientHandle* dst_client, BufferHandle* dst) {
-    DCHECK(src->is_open());
-    DCHECK(!dst->is_open());
-    DCHECK_EQ(src_client, src->client_);
-    DCHECK_NE(src, dst);
-    DCHECK_NE(src_client, dst_client);
-
-    *dst = std::move(*src);
-    dst->client_ = dst_client;
-    return Status::OK();
-}
-
-void BufferPool::Maintenance() {
-    allocator_->Maintenance();
-}
-
-void BufferPool::ReleaseMemory(int64_t bytes_to_free) {
-    allocator_->ReleaseMemory(bytes_to_free);
-}
-
-int64_t BufferPool::GetSystemBytesLimit() const {
-    return allocator_->system_bytes_limit();
-}
-
-int64_t BufferPool::GetSystemBytesAllocated() const {
-    return allocator_->GetSystemBytesAllocated();
-}
-
-int64_t BufferPool::GetCleanPageBytesLimit() const {
-    return allocator_->GetCleanPageBytesLimit();
-}
-
-int64_t BufferPool::GetNumCleanPages() const {
-    return allocator_->GetNumCleanPages();
-}
-
-int64_t BufferPool::GetCleanPageBytes() const {
-    return allocator_->GetCleanPageBytes();
-}
-
-int64_t BufferPool::GetNumFreeBuffers() const {
-    return allocator_->GetNumFreeBuffers();
-}
-
-int64_t BufferPool::GetFreeBufferBytes() const {
-    return allocator_->GetFreeBufferBytes();
-}
-
-bool BufferPool::ClientHandle::has_unpinned_pages() const {
-    return impl_->has_unpinned_pages();
-}
-
-BufferPool::Client::Client(BufferPool* pool, //TmpFileMgr::FileGroup* file_group,
-                           const string& name, RuntimeProfile* profile)
-        : pool_(pool),
-          //file_group_(file_group),
-          name_(name),
-          debug_write_delay_ms_(0),
-          num_pages_(0),
-          buffers_allocated_bytes_(0) {
-    // Set up a child profile with buffer pool info.
-    RuntimeProfile* child_profile = profile->create_child("Buffer pool", true, true);
-    counters_.alloc_time = ADD_TIMER(child_profile, "AllocTime");
-    counters_.cumulative_allocations =
-            ADD_COUNTER(child_profile, "CumulativeAllocations", TUnit::UNIT);
-    counters_.cumulative_bytes_alloced =
-            ADD_COUNTER(child_profile, "CumulativeAllocationBytes", TUnit::BYTES);
-    counters_.peak_unpinned_bytes =
-            child_profile->AddHighWaterMarkCounter("PeakUnpinnedBytes", TUnit::BYTES);
-}
-
-BufferPool::Page* BufferPool::Client::CreatePinnedPage(BufferHandle&& buffer) {
-    Page* page = new Page(this, buffer.len());
-    page->buffer = std::move(buffer);
-    page->pin_count = 1;
-
-    std::lock_guard<std::mutex> lock(lock_);
-    // The buffer is transferred to the page so will be accounted for in
-    // pinned_pages_.bytes() instead of buffers_allocated_bytes_.
-    buffers_allocated_bytes_ -= page->len;
-    pinned_pages_.enqueue(page);
-    ++num_pages_;
-    DCHECK_CONSISTENCY();
-    return page;
-}
-
-void BufferPool::Client::DestroyPageInternal(PageHandle* handle, BufferHandle* out_buffer) {
-    DCHECK(handle->is_pinned() || out_buffer == nullptr);
-    Page* page = handle->page_;
-    // Remove the page from the list that it is currently present in (if any).
-    {
-        std::unique_lock<std::mutex> cl(lock_);
-        // First try to remove from the pinned or dirty unpinned lists.
-        if (!pinned_pages_.remove(page) && !dirty_unpinned_pages_.remove(page)) {
-            // The page either has a write in flight, is clean, or is evicted.
-            // Let the write complete, if in flight.
-            //WaitForWrite(&cl, page);
-            // If clean, remove it from the clean pages list. If evicted, this is a no-op.
-            pool_->allocator_->RemoveCleanPage(cl, out_buffer != nullptr, page);
-        }
-        DCHECK(!page->in_queue());
-        --num_pages_;
-    }
-
-    //if (page->write_handle != nullptr) {
-    // Discard any on-disk data.
-    //file_group_->DestroyWriteHandle(move(page->write_handle));
-    //}
-    //
-    if (out_buffer != nullptr) {
-        DCHECK(page->buffer.is_open());
-        *out_buffer = std::move(page->buffer);
-        buffers_allocated_bytes_ += out_buffer->len();
-    } else if (page->buffer.is_open()) {
-        pool_->allocator_->Free(std::move(page->buffer));
-    }
-    delete page;
-    handle->Reset();
-}
-
-void BufferPool::Client::MoveToDirtyUnpinned(Page* page) {
-    // Only valid to unpin pages if spilling is enabled.
-    // DCHECK(spilling_enabled());
-    DCHECK_EQ(0, page->pin_count);
-
-    std::unique_lock<std::mutex> lock(lock_);
-    DCHECK_CONSISTENCY();
-    DCHECK(pinned_pages_.contains(page));
-    pinned_pages_.remove(page);
-    dirty_unpinned_pages_.enqueue(page);
-
-    // Check if we should initiate writes for this (or another) dirty page.
-    //WriteDirtyPagesAsync();
-}
-
-Status BufferPool::Client::StartMoveToPinned(ClientHandle* client, Page* page) {
-    std::unique_lock<std::mutex> cl(lock_);
-    DCHECK_CONSISTENCY();
-    // Propagate any write errors that occurred for this client.
-    //RETURN_IF_ERROR(write_status_i;
-
-    if (dirty_unpinned_pages_.remove(page)) {
-        // No writes were initiated for the page - just move it back to the pinned state.
-        pinned_pages_.enqueue(page);
-        return Status::OK();
-    }
-
-    return Status::InternalError("start move to pinned error, page is not in dirty.");
-    /*
-  if (in_flight_write_pages_.contains(page)) {
-    // A write is in flight. If so, wait for it to complete - then we only have to
-    // handle the pinned and evicted cases.
-    WaitForWrite(&cl, page);
-    RETURN_IF_ERROR(write_status_); // The write may have set 'write_status_'.
-  }
-
-  // At this point we need to either reclaim a clean page or allocate a new buffer.
-  // We may need to clean some pages to do so.
-  RETURN_IF_ERROR(CleanPages(&cl, page->len));
-  if (pool_->allocator_->RemoveCleanPage(cl, true, page)) {
-    // The clean page still has an associated buffer. Restore the data, and move the page
-    // back to the pinned state.
-    pinned_pages_.enqueue(page);
-    DCHECK(page->buffer.is_open());
-    DCHECK(page->write_handle != nullptr);
-    // Don't need on-disk data.
-    cl.unlock(); // Don't block progress for other threads operating on other pages.
-    return file_group_->RestoreData(move(page->write_handle), page->buffer.mem_range());
-  }
-  // If the page wasn't in the clean pages list, it must have been evicted.
-  return StartMoveEvictedToPinned(&cl, client, page);
-*/
-}
-/*
-Status BufferPool::Client::StartMoveEvictedToPinned(
-    unique_lock<std::mutex>* client_lock, ClientHandle* client, Page* page) {
-  DCHECK(!page->buffer.is_open());
-
-  // Safe to modify the page's buffer handle without holding the page lock because no
-  // concurrent operations can modify evicted pages.
-  BufferHandle buffer;
-  RETURN_IF_ERROR(pool_->allocator_->Allocate(client, page->len, &page->buffer));
-  COUNTER_ADD(counters().bytes_read, page->len);
-  COUNTER_ADD(counters().read_io_ops, 1);
-  RETURN_IF_ERROR(
-      file_group_->ReadAsync(page->write_handle.get(), page->buffer.mem_range()));
-  pinned_pages_.enqueue(page);
-  page->pin_in_flight = true;
-  DCHECK_CONSISTENCY();
-  return Status::OK();
-}
-
-void BufferPool::Client::UndoMoveEvictedToPinned(Page* page) {
-  // We need to get the page back to the evicted state where:
-  // * There is no in-flight read.
-  // * The page's data is on disk referenced by 'write_handle'
-  // * The page has no attached buffer.
-  DCHECK(page->pin_in_flight);
-  page->write_handle->CancelRead();
-  page->pin_in_flight = false;
-
-  unique_lock<std::mutex> lock(lock_);
-  DCHECK_CONSISTENCY();
-  DCHECK(pinned_pages_.contains(page));
-  pinned_pages_.remove(page);
-  // Discard the buffer - the pin was in flight so there was no way that a valid
-  // reference to the buffer's contents was returned since the pin was still in flight.
-  pool_->allocator_->Free(move(page->buffer));
-}
-*/
-/*
-Status BufferPool::Client::FinishMoveEvictedToPinned(Page* page) {
-  DCHECK(page->pin_in_flight);
-  SCOPED_TIMER(counters().read_wait_time);
-  // Don't hold any locks while reading back the data. It is safe to modify the page's
-  // buffer handle without holding any locks because no concurrent operations can modify
-  // evicted pages.
-  RETURN_IF_ERROR(
-      file_group_->WaitForAsyncRead(page->write_handle.get(), page->buffer.mem_range()));
-  file_group_->DestroyWriteHandle(move(page->write_handle));
-  page->pin_in_flight = false;
-  return Status::OK();
-}
-*/
-Status BufferPool::Client::PrepareToAllocateBuffer(int64_t len) {
-    std::unique_lock<std::mutex> lock(lock_);
-    // Clean enough pages to allow allocation to proceed without violating our eviction
-    // policy. This can fail, so only update the accounting once success is ensured.
-    //RETURN_IF_ERROR(CleanPages(&lock, len));
-    buffers_allocated_bytes_ += len;
-    DCHECK_CONSISTENCY();
-    return Status::OK();
-}
-
-Status BufferPool::Client::CleanPages(std::unique_lock<std::mutex>* client_lock, int64_t len) {
-    DCheckHoldsLock(*client_lock);
-    DCHECK_CONSISTENCY();
-    /*
-  // Work out what we need to get bytes of dirty unpinned + in flight pages down to
-  // in order to satisfy the eviction policy.
-  int64_t target_dirty_bytes = reservation_.GetReservation() - buffers_allocated_bytes_
-      - pinned_pages_.bytes() - len;
-  // Start enough writes to ensure that the loop condition below will eventually become
-  // false (or a write error will be encountered).
-  int64_t min_bytes_to_write =
-      max<int64_t>(0, dirty_unpinned_pages_.bytes() - target_dirty_bytes);
-  //WriteDirtyPagesAsync(min_bytes_to_write);
-
-  // One of the writes we initiated, or an earlier in-flight write may have hit an error.
-  RETURN_IF_ERROR(write_status_);
-
-  // Wait until enough writes have finished so that we can make the allocation without
-  // violating the eviction policy. I.e. so that other clients can immediately get the
-  // memory they're entitled to without waiting for this client's write to complete.
-  DCHECK_GE(in_flight_write_pages_.bytes(), min_bytes_to_write);
-  while (dirty_unpinned_pages_.bytes() + in_flight_write_pages_.bytes()
-      > target_dirty_bytes) {
-    SCOPED_TIMER(counters().write_wait_time);
-    write_complete_cv_.Wait(*client_lock);
-    RETURN_IF_ERROR(write_status_); // Check if error occurred while waiting.
-  }
-*/
-    return Status::OK();
-}
-/*
-void BufferPool::Client::WriteDirtyPagesAsync(int64_t min_bytes_to_write) {
-  DCHECK_GE(min_bytes_to_write, 0);
-  DCHECK_LE(min_bytes_to_write, dirty_unpinned_pages_.bytes());
- // if (file_group_ == nullptr) {
-    // Spilling disabled - there should be no unpinned pages to write.
-    DCHECK_EQ(0, min_bytes_to_write);
-    DCHECK_EQ(0, dirty_unpinned_pages_.bytes());
-    return;
-////  }
-
-  // No point in starting writes if an error occurred because future operations for the
-  // client will fail regardless.
-  if (!write_status_.ok()) return;
-
-  // Compute the ideal amount of writes to start. We use a simple heuristic based on the
-  // total number of writes. The FileGroup's allocation should spread the writes across
-  // disks somewhat, but doesn't guarantee we're fully using all available disks. In
-  // future we could track the # of writes per-disk.
-  const int64_t target_writes = FLAGS_concurrent_scratch_ios_per_device
-      * file_group_->tmp_file_mgr()->NumActiveTmpDevices();
-
-  int64_t bytes_written = 0;
-  while (!dirty_unpinned_pages_.empty()
-      && (bytes_written < min_bytes_to_write
-             || in_flight_write_pages_.size() < target_writes)) {
-    Page* page = dirty_unpinned_pages_.tail(); // LIFO.
-    DCHECK(page != nullptr) << "Should have been enough dirty unpinned pages";
-    {
-      std::lock_guard<SpinLock> pl(page->buffer_lock);
-      DCHECK(file_group_ != nullptr);
-      DCHECK(page->buffer.is_open());
-      COUNTER_ADD(counters().bytes_written, page->len);
-      COUNTER_ADD(counters().write_io_ops, 1);
-      Status status = file_group_->Write(page->buffer.mem_range(),
-          [this, page](const Status& write_status) {
-            WriteCompleteCallback(page, write_status);
-          },
-          &page->write_handle);
-      // Exit early on error: there is no point in starting more writes because future
-      /// operations for this client will fail regardless.
-      if (!status.ok()) {
-        write_status_.MergeStatus(status);
-        return;
-      }
-    }
-    // Now that the write is in flight, update all the state
-    Page* tmp = dirty_unpinned_pages_.pop_back();
-    DCHECK_EQ(tmp, page);
-    in_flight_write_pages_.enqueue(page);
-    bytes_written += page->len;
-  } 
-}
-
-void BufferPool::Client::WriteCompleteCallback(Page* page, const Status& write_status) {
-#ifndef NDEBUG
-  if (debug_write_delay_ms_ > 0) SleepForMs(debug_write_delay_ms_);
-#endif
-  {
-    std::unique_lock<std::mutex> cl(lock_);
-    DCHECK(in_flight_write_pages_.contains(page));
-    // The status should always be propagated.
-    // TODO: if we add cancellation support to TmpFileMgr, consider cancellation path.
-    if (!write_status.ok()) write_status_.MergeStatus(write_status);
-    in_flight_write_pages_.remove(page);
-    // Move to clean pages list even if an error was encountered - the buffer can be
-    // repurposed by other clients and 'write_status_' must be checked by this client
-    // before reading back the bad data.
-    pool_->allocator_->AddCleanPage(cl, page);
-    WriteDirtyPagesAsync(); // Start another asynchronous write if needed.
-
-    // Notify before releasing lock to avoid race with Page and Client destruction.
-    page->write_complete_cv_.NotifyAll();
-    write_complete_cv_.NotifyAll();
-  }
-}
-
-void BufferPool::Client::WaitForWrite(std::unique_lock<std::mutex>* client_lock, Page* page) {
-  DCheckHoldsLock(*client_lock);
-  while (in_flight_write_pages_.contains(page)) {
-    SCOPED_TIMER(counters().write_wait_time);
-    page->write_complete_cv_.Wait(*client_lock);
-  }
-}
-
-void BufferPool::Client::WaitForAllWrites() {
-  std::unique_lock<std::mutex> cl(lock_);
-  while (in_flight_write_pages_.size() > 0) {
-    write_complete_cv_.Wait(cl);
-  }
-}
-*/
-string BufferPool::Client::DebugString() {
-    std::lock_guard<std::mutex> lock(lock_);
-    std::stringstream ss;
-    ss << "<BufferPool::Client> " << this << " name: " << name_
-       << " write_status: " << write_status_ << " buffers allocated " << buffers_allocated_bytes_
-       << " num_pages: " << num_pages_ << " pinned_bytes: " << pinned_pages_.bytes()
-       << " dirty_unpinned_bytes: " << dirty_unpinned_pages_.bytes()
-       << " in_flight_write_bytes: " << in_flight_write_pages_.bytes();
-    ss << "\n  " << pinned_pages_.size() << " pinned pages: ";
-    pinned_pages_.iterate(std::bind<bool>(Page::DebugStringCallback, &ss, std::placeholders::_1));
-    ss << "\n  " << dirty_unpinned_pages_.size() << " dirty unpinned pages: ";
-    dirty_unpinned_pages_.iterate(
-            std::bind<bool>(Page::DebugStringCallback, &ss, std::placeholders::_1));
-    ss << "\n  " << in_flight_write_pages_.size() << " in flight write pages: ";
-    in_flight_write_pages_.iterate(
-            std::bind<bool>(Page::DebugStringCallback, &ss, std::placeholders::_1));
-    return ss.str();
-}
-
-string BufferPool::ClientHandle::DebugString() const {
-    std::stringstream ss;
-    if (is_registered()) {
-        ss << "<BufferPool::Client> " << this << " internal state: {" << impl_->DebugString()
-           << "}";
-        return ss.str();
-    } else {
-        ss << "<BufferPool::ClientHandle> " << this << " UNREGISTERED";
-        return ss.str();
-    }
-}
-/*
-string BufferPool::PageHandle::DebugString() const {
-  if (is_open()) {
-    std::lock_guard<SpinLock> pl(page_->buffer_lock);
-    return Substitute("<BufferPool::PageHandle> $0 client: $1/$2 page: {$3}", this,
-        client_, client_->impl_, page_->DebugString());
-  } else {
-    return Substitute("<BufferPool::PageHandle> $0 CLOSED", this);
-  }
-}
-*/
-string BufferPool::Page::DebugString() {
-    std::stringstream ss;
-    ss << "<BufferPool::Page> " << this << " len: " << len << " pin_count:" << pin_count
-       << " buf:" << buffer.DebugString();
-    return ss.str();
-}
-
-bool BufferPool::Page::DebugStringCallback(std::stringstream* ss, BufferPool::Page* page) {
-    std::lock_guard<SpinLock> pl(page->buffer_lock);
-    (*ss) << page->DebugString() << "\n";
-    return true;
-}
-
-string BufferPool::BufferHandle::DebugString() const {
-    std::stringstream ss;
-    if (is_open()) {
-        ss << "<BufferPool::BufferHandle> " << this << " client: " << client_ << "/"
-           << client_->impl_ << " data: " << data_ << " len: " << len_;
-    } else {
-        ss << "<BufferPool::BufferHandle> " << this << " CLOSED";
-    }
-    return ss.str();
-}
-
-string BufferPool::DebugString() {
-    std::stringstream ss;
-    ss << "<BufferPool> " << this << " min_buffer_len: " << min_buffer_len_ << "\n"
-       << allocator_->DebugString();
-    return ss.str();
-}
-} // namespace doris
diff --git a/be/src/runtime/bufferpool/buffer_pool.h b/be/src/runtime/bufferpool/buffer_pool.h
deleted file mode 100644
index 469f5071db..0000000000
--- a/be/src/runtime/bufferpool/buffer_pool.h
+++ /dev/null
@@ -1,466 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <stdint.h>
-
-#include <string>
-#include <vector>
-
-#include "common/compiler_util.h"
-#include "common/object_pool.h"
-#include "common/status.h"
-#include "gutil/dynamic_annotations.h"
-#include "gutil/macros.h"
-#include "util/aligned_new.h"
-#include "util/internal_queue.h"
-#include "util/mem_range.h"
-#include "util/spinlock.h"
-
-namespace doris {
-
-class RuntimeProfile;
-class SystemAllocator;
-class MemTracker;
-
-/// A buffer pool that manages memory buffers for all queries in an Impala daemon.
-/// The buffer pool enforces buffer reservations, limits, and implements policies
-/// for moving spilled memory from in-memory buffers to disk. It also enables reuse of
-/// buffers between queries, to avoid frequent allocations.
-///
-/// The buffer pool can be used for allocating any large buffers (above a configurable
-/// minimum length), whether or not the buffers will be spilled. Smaller allocations
-/// are not serviced directly by the buffer pool: clients of the buffer pool must
-/// subdivide buffers if they wish to use smaller allocations.
-///
-/// All buffer pool operations are in the context of a registered buffer pool client.
-/// A buffer pool client should be created for every allocator of buffers at the level
-/// of granularity required for reporting and enforcement of reservations, e.g. an
-/// operator. The client tracks buffer reservations via its ReservationTracker and also
-/// includes info that is helpful for debugging (e.g. the operator that is associated
-/// with the buffer). Unless otherwise noted, it is not safe to invoke concurrent buffer
-/// pool operations for the same client.
-///
-/// Pages, Buffers and Pinning
-/// ==========================
-/// * A page is a logical block of memory that can reside in memory or on disk.
-/// * A buffer is a physical block of memory that can hold a page in memory.
-/// * A page handle is used by buffer pool clients to identify and access a page and
-///   the corresponding buffer. Clients do not interact with pages directly.
-/// * A buffer handle is used by buffer pool clients to identify and access a buffer.
-/// * A page is pinned if it has pin count > 0. A pinned page stays mapped to the same
-///   buffer.
-/// * An unpinned page can be written out to disk by the buffer pool so that the buffer
-///   can be used for another purpose.
-///
-/// Buffer/Page Sizes
-/// =================
-/// The buffer pool has a minimum buffer size, which must be a power-of-two. Page and
-/// buffer sizes must be an exact power-of-two multiple of the minimum buffer size.
-///
-/// Reservations
-/// ============
-/// Before allocating buffers or pinning pages, a client must reserve memory through its
-/// ReservationTracker. Reservation of n bytes give a client the right to allocate
-/// buffers or pin pages summing up to n bytes. Reservations are both necessary and
-/// sufficient for a client to allocate buffers or pin pages: the operations succeed
-/// unless a "system error" such as a disk write error is encountered that prevents
-/// unpinned pages from being written to disk.
-///
-/// More memory may be reserved than is used, e.g. if a client is not using its full
-/// reservation. In such cases, the buffer pool can use the free buffers in any way,
-/// e.g. for keeping unpinned pages in memory, so long as it is able to fulfill the
-/// reservations when needed, e.g. by flushing unpinned pages to disk.
-///
-/// Page/Buffer Handles
-/// ===================
-/// The buffer pool exposes PageHandles and BufferHandles, which are owned by clients of
-/// the buffer pool, and act as a proxy for the internal data structure representing the
-/// page or buffer in the buffer pool. Handles are "open" if they are associated with a
-/// page or buffer. An open PageHandle is obtained by creating a page. PageHandles are
-/// closed by calling BufferPool::DestroyPage(). An open BufferHandle is obtained by
-/// allocating a buffer or extracting a BufferHandle from a PageHandle. The buffer of a
-/// pinned page can also be accessed through the PageHandle. The handle destructors check
-/// for resource leaks, e.g. an open handle that would result in a buffer leak.
-///
-/// Pin Counting of Page Handles:
-/// ----------------------------------
-/// Page handles are scoped to a client. The invariants are as follows:
-/// * A page can only be accessed through an open handle.
-/// * A page is destroyed once the handle is destroyed via DestroyPage().
-/// * A page's buffer can only be accessed through a pinned handle.
-/// * Pin() can be called on an open handle, incrementing the handle's pin count.
-/// * Unpin() can be called on a pinned handle, but not an unpinned handle.
-/// * Pin() always increases usage of reservations, and Unpin() always decreases usage,
-///   i.e. the handle consumes <pin count> * <page size> bytes of reservation.
-///
-/// Example Usage: Buffers
-/// ==================================
-/// The simplest use case is to allocate a memory buffer.
-/// * The new buffer is created with AllocateBuffer().
-/// * The client reads and writes to the buffer as it sees fit.
-/// * If the client is done with the buffer's contents it can call FreeBuffer() to
-///   destroy the handle and free the buffer, or use TransferBuffer() to transfer
-///   the buffer to a different client.
-///
-/// Example Usage: Spillable Pages
-/// ==============================
-/// * In order to spill pages to disk, the Client must be registered with a FileGroup,
-///   which is used to allocate scratch space on disk.
-/// * A spilling operator creates a new page with CreatePage().
-/// * The client reads and writes to the page's buffer as it sees fit.
-/// * If the operator encounters memory pressure, it can decrease reservation usage by
-///   calling Unpin() on the page. The page may then be written to disk and its buffer
-///   repurposed internally by BufferPool.
-/// * Once the operator needs the page's contents again and has sufficient unused
-///   reservation, it can call Pin(), which brings the page's contents back into memory,
-///   perhaps in a different buffer. Therefore the operator must fix up any pointers into
-///   the previous buffer. Pin() executes asynchronously - the caller only blocks waiting
-///   for read I/O if it calls GetBuffer() or ExtractBuffer() while the read is in
-///   flight.
-/// * If the operator is done with the page, it can call DestroyPage() to destroy the
-///   handle and release resources, or call ExtractBuffer() to extract the buffer.
-///
-/// Synchronization
-/// ===============
-/// The data structures in the buffer pool itself are thread-safe. Client-owned data
-/// structures - Client, PageHandle and BufferHandle - are not protected from concurrent
-/// accesses. Clients must ensure that they do not invoke concurrent operations with the
-/// same Client, PageHandle or BufferHandle.
-class BufferPool : public CacheLineAligned {
-public:
-    struct BufferAllocator;
-    class BufferHandle;
-    class ClientHandle;
-    class PageHandle;
-    /// Constructs a new buffer pool.
-    /// 'min_buffer_len': the minimum buffer length for the pool. Must be a power of two.
-    /// 'buffer_bytes_limit': the maximum physical memory in bytes that can be used by the
-    ///     buffer pool. If 'buffer_bytes_limit' is not a multiple of 'min_buffer_len', the
-    ///     remainder will not be usable.
-    /// 'clean_page_bytes_limit': the maximum bytes of clean pages that will be retained by the
-    ///     buffer pool.
-    BufferPool(int64_t min_buffer_len, int64_t buffer_bytes_limit, int64_t clean_page_bytes_limit);
-    ~BufferPool();
-
-    /// Register a client. Returns an error status and does not register the client if the
-    /// arguments are invalid. 'name' is an arbitrary name used to identify the client in
-    /// any errors messages or logging. If 'file_group' is non-nullptr, it is used to allocate
-    /// scratch space to write unpinned pages to disk. If it is nullptr, unpinning of pages is
-    /// not allowed for this client. Counters for this client are added to the (non-nullptr)
-    /// 'profile'. 'client' is the client to register. 'client' must not already be
-    /// registered.
-    Status RegisterClient(const std::string& name, RuntimeProfile* profile,
-                          ClientHandle* client) WARN_UNUSED_RESULT;
-
-    /// Deregister 'client' if it is registered. All pages must be destroyed and buffers
-    /// must be freed for the client before calling this. Releases any reservation that
-    /// belongs to the client. Idempotent.
-    void DeregisterClient(ClientHandle* client);
-
-    /// Create a new page of 'len' bytes with pin count 1. 'len' must be a page length
-    /// supported by BufferPool (see BufferPool class comment). The client must have
-    /// sufficient unused reservation to pin the new page (otherwise it will DCHECK).
-    /// CreatePage() only fails when a system error prevents the buffer pool from fulfilling
-    /// the reservation.
-    /// On success, the handle is mapped to the new page and 'buffer', if non-nullptr, is set
-    /// to the page's buffer.
-    Status CreatePage(ClientHandle* client, int64_t len, PageHandle* handle,
-                      const BufferHandle** buffer = nullptr) WARN_UNUSED_RESULT;
-
-    /// Increment the pin count of 'handle'. After Pin() the underlying page will
-    /// be mapped to a buffer, which will be accessible through 'handle'. If the data
-    /// was evicted from memory, it will be read back into memory asynchronously.
-    /// Attempting to access the buffer with ExtractBuffer() or handle.GetBuffer() will
-    /// block until the data is in memory. The caller is responsible for ensuring it has
-    /// enough unused reservation before calling Pin() (otherwise it will DCHECK). Pin()
-    /// only fails when a system error prevents the buffer pool from fulfilling the
-    /// reservation or if an I/O error is encountered reading back data from disk.
-    /// 'handle' must be open.
-    Status Pin(ClientHandle* client, PageHandle* handle) WARN_UNUSED_RESULT;
-
-    /// Decrement the pin count of 'handle'. Decrease client's reservation usage. If the
-    /// handle's pin count becomes zero, it is no longer valid for the underlying page's
-    /// buffer to be accessed via 'handle'. If the page's total pin count across all
-    /// handles that reference it goes to zero, the page's data may be written to disk and
-    /// the buffer reclaimed. 'handle' must be open and have a pin count > 0.
-    ///
-    /// It is an error to reduce the pin count to 0 if 'client' does not have an associated
-    /// FileGroup.
-    void Unpin(ClientHandle* client, PageHandle* handle);
-
-    /// Destroy the page referenced by 'handle' (if 'handle' is open). Any buffers or disk
-    /// storage backing the page are freed. Idempotent. If the page is pinned, the
-    /// reservation usage is decreased accordingly.
-    void DestroyPage(ClientHandle* client, PageHandle* handle);
-
-    /// Extracts buffer from a pinned page. After this returns, the page referenced by
-    /// 'page_handle' will be destroyed and 'buffer_handle' will reference the buffer from
-    /// 'page_handle'. This may decrease reservation usage of 'client' if the page was
-    /// pinned multiple times via 'page_handle'. May return an error if 'page_handle' was
-    /// unpinned earlier with no subsequent GetBuffer() call and a read error is
-    /// encountered while bringing the page back into memory.
-    Status ExtractBuffer(ClientHandle* client, PageHandle* page_handle,
-                         BufferHandle* buffer_handle) WARN_UNUSED_RESULT;
-
-    /// Allocates a new buffer of 'len' bytes. Uses reservation from 'client'. The caller
-    /// is responsible for ensuring it has enough unused reservation before calling
-    /// AllocateBuffer() (otherwise it will DCHECK). AllocateBuffer() only fails when
-    /// a system error prevents the buffer pool from fulfilling the reservation.
-    Status AllocateBuffer(ClientHandle* client, int64_t len,
-                          BufferHandle* handle) WARN_UNUSED_RESULT;
-
-    /// If 'handle' is open, close 'handle', free the buffer and decrease the reservation
-    /// usage from 'client'. Idempotent. Safe to call concurrently with any other
-    /// operations for 'client'.
-    void FreeBuffer(ClientHandle* client, BufferHandle* handle);
-
-    /// Transfer ownership of buffer from 'src_client' to 'dst_client' and move the
-    /// handle from 'src' to 'dst'. Increases reservation usage in 'dst_client' and
-    /// decreases reservation usage in 'src_client'. 'src' must be open and 'dst' must be
-    /// closed before calling. 'src'/'dst' and 'src_client'/'dst_client' must be different.
-    /// After a successful call, 'src' is closed and 'dst' is open. Safe to call
-    /// concurrently with any other operations for 'src_client'.
-    Status TransferBuffer(ClientHandle* src_client, BufferHandle* src, ClientHandle* dst_client,
-                          BufferHandle* dst) WARN_UNUSED_RESULT;
-
-    /// Try to release at least 'bytes_to_free' bytes of memory to the system allocator.
-    /// TODO: once IMPALA-4834 is done and all large allocations are served from the buffer
-    /// pool, this may not be necessary.
-    void ReleaseMemory(int64_t bytes_to_free);
-
-    /// Called periodically by a maintenance thread to release unused memory back to the
-    /// system allocator.
-    void Maintenance();
-
-    /// Print a debug string with the state of the buffer pool.
-    std::string DebugString();
-
-    int64_t min_buffer_len() const { return min_buffer_len_; }
-    int64_t GetSystemBytesLimit() const;
-    int64_t GetSystemBytesAllocated() const;
-
-    /// Return the limit on bytes of clean pages in the pool.
-    int64_t GetCleanPageBytesLimit() const;
-
-    /// Return the total number of clean pages in the pool.
-    int64_t GetNumCleanPages() const;
-
-    /// Return the total bytes of clean pages in the pool.
-    int64_t GetCleanPageBytes() const;
-
-    /// Return the total number of free buffers in the pool.
-    int64_t GetNumFreeBuffers() const;
-
-    /// Return the total bytes of free buffers in the pool.
-    int64_t GetFreeBufferBytes() const;
-
-    /// Generous upper bounds on page and buffer size and the number of different
-    /// power-of-two buffer sizes.
-    static constexpr int LOG_MAX_BUFFER_BYTES = 48;
-    static constexpr int64_t MAX_BUFFER_BYTES = 1L << LOG_MAX_BUFFER_BYTES;
-
-protected:
-    friend class BufferPoolTest;
-    /// Test helper: get a reference to the allocator.
-    BufferAllocator* allocator() { return allocator_.get(); }
-
-private:
-    DISALLOW_COPY_AND_ASSIGN(BufferPool);
-    class Client;
-    class FreeBufferArena;
-    class PageList;
-    class Page;
-
-    /// Allocator for allocating and freeing all buffer memory and managing lists of free
-    /// buffers and clean pages.
-    std::unique_ptr<BufferAllocator> allocator_;
-
-    /// The minimum length of a buffer in bytes. All buffers and pages are a power-of-two
-    /// multiple of this length. This is always a power of two.
-    const int64_t min_buffer_len_;
-};
-
-/// External representation of a client of the BufferPool. Clients are used for
-/// reservation accounting, and will be used in the future for tracking per-client
-/// buffer pool counters. This class is the external handle for a client so
-/// each Client instance is owned by the BufferPool's client, rather than the BufferPool.
-/// Each Client should only be used by a single thread at a time: concurrently calling
-/// Client methods or BufferPool methods with the Client as an argument is not supported.
-class BufferPool::ClientHandle {
-public:
-    ClientHandle() : impl_(nullptr) {}
-    /// Client must be deregistered.
-    ~ClientHandle() { DCHECK(!is_registered()); }
-
-    bool is_registered() const { return impl_ != nullptr; }
-
-    /// Return true if there are any unpinned pages for this client.
-    bool has_unpinned_pages() const;
-
-    std::string DebugString() const;
-
-private:
-    friend class BufferPool;
-    friend class BufferPoolTest;
-    DISALLOW_COPY_AND_ASSIGN(ClientHandle);
-
-    /// Internal state for the client. nullptr means the client isn't registered.
-    /// Owned by BufferPool.
-    Client* impl_;
-};
-
-/// A handle to a buffer allocated from the buffer pool. Each BufferHandle should only
-/// be used by a single thread at a time: concurrently calling BufferHandle methods or
-/// BufferPool methods with the BufferHandle as an argument is not supported.
-class BufferPool::BufferHandle {
-public:
-    BufferHandle() { Reset(); }
-    ~BufferHandle() { DCHECK(!is_open()); }
-
-    /// Allow move construction of handles to support std::move(). Inline to make moving
-    /// efficient.
-    BufferHandle(BufferHandle&& src);
-
-    /// Allow move assignment of handles to support STL classes like std::vector.
-    /// Destination must be uninitialized. Inline to make moving efficient.
-    BufferHandle& operator=(BufferHandle&& src);
-
-    bool is_open() const { return data_ != nullptr; }
-    int64_t len() const {
-        DCHECK(is_open());
-        return len_;
-    }
-    /// Get a pointer to the start of the buffer.
-    uint8_t* data() const {
-        DCHECK(is_open());
-        return data_;
-    }
-
-    MemRange mem_range() const { return MemRange(data(), len()); }
-
-    std::string DebugString() const;
-
-    /// Poison the memory associated with this handle. If ASAN is not enabled, this is a
-    /// no-op.
-    void Poison() { ASAN_POISON_MEMORY_REGION(data(), len()); }
-
-    /// Unpoison the memory associated with this handle. If ASAN is not enabled, this is a
-    /// no-op.
-    void Unpoison() { ASAN_UNPOISON_MEMORY_REGION(data(), len()); }
-
-private:
-    DISALLOW_COPY_AND_ASSIGN(BufferHandle);
-    friend class BufferPool;
-    friend class SystemAllocator;
-
-    /// Internal helper to set the handle to an opened state.
-    void Open(uint8_t* data, int64_t len, int home_core);
-
-    /// Internal helper to reset the handle to an unopened state. Inlined to make moving
-    /// efficient.
-    void Reset();
-
-    /// The client the buffer handle belongs to, used to validate that the correct client
-    /// is provided in BufferPool method calls. Set to nullptr if the buffer is in a free list.
-    const ClientHandle* client_;
-
-    /// Pointer to the start of the buffer. Non-nullptr if open, nullptr if closed.
-    uint8_t* data_;
-
-    /// Length of the buffer in bytes.
-    int64_t len_;
-
-    /// The CPU core that the buffer was allocated from - used to determine which arena
-    /// it will be added to.
-    int home_core_;
-};
-
-/// The handle for a page used by clients of the BufferPool. Each PageHandle should
-/// only be used by a single thread at a time: concurrently calling PageHandle methods
-/// or BufferPool methods with the PageHandle as an argument is not supported.
-class BufferPool::PageHandle {
-public:
-    PageHandle();
-    ~PageHandle() { DCHECK(!is_open()); }
-
-    // Allow move construction of page handles, to support std::move().
-    PageHandle(PageHandle&& src);
-
-    // Allow move assignment of page handles, to support STL classes like std::vector.
-    // Destination must be closed.
-    PageHandle& operator=(PageHandle&& src);
-
-    bool is_open() const { return page_ != nullptr; }
-    bool is_pinned() const { return pin_count() > 0; }
-    int pin_count() const;
-    int64_t len() const;
-
-    /// Get a reference to the page's buffer handle. Only valid to call if the page is
-    /// pinned. If the page was previously unpinned and the read I/O for the data is still
-    /// in flight, this can block waiting. Returns an error if an error was encountered
-    /// reading the data back, which can only happen if Unpin() was called on the page
-    /// since the last call to GetBuffer(). Only const accessors of the returned handle can
-    /// be used: it is invalid to call FreeBuffer() or TransferBuffer() on it or to
-    /// otherwise modify the handle.
-    Status GetBuffer(const BufferHandle** buffer_handle) const WARN_UNUSED_RESULT;
-
-    std::string DebugString() const;
-
-private:
-    DISALLOW_COPY_AND_ASSIGN(PageHandle);
-    friend class BufferPool;
-    friend class BufferPoolTest;
-    friend class Page;
-
-    /// Internal helper to open the handle for the given page.
-    void Open(Page* page, ClientHandle* client);
-
-    /// Internal helper to reset the handle to an unopened state.
-    void Reset();
-
-    /// The internal page structure. nullptr if the handle is not open.
-    Page* page_;
-
-    /// The client the page handle belongs to.
-    ClientHandle* client_;
-};
-
-inline BufferPool::BufferHandle::BufferHandle(BufferHandle&& src) {
-    Reset();
-    *this = std::move(src);
-}
-
-inline BufferPool::BufferHandle& BufferPool::BufferHandle::operator=(BufferHandle&& src) {
-    DCHECK(!is_open());
-    // Copy over all members then close src.
-    client_ = src.client_;
-    data_ = src.data_;
-    len_ = src.len_;
-    home_core_ = src.home_core_;
-    src.Reset();
-    return *this;
-}
-
-inline void BufferPool::BufferHandle::Reset() {
-    client_ = nullptr;
-    data_ = nullptr;
-    len_ = -1;
-    home_core_ = -1;
-}
-} // namespace doris
diff --git a/be/src/runtime/bufferpool/buffer_pool_counters.h b/be/src/runtime/bufferpool/buffer_pool_counters.h
deleted file mode 100644
index 7e3ccb79bc..0000000000
--- a/be/src/runtime/bufferpool/buffer_pool_counters.h
+++ /dev/null
@@ -1,43 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef DORIS_BE_RUNTIME_BUFFER_POOL_COUNTERS_H
-#define DORIS_BE_RUNTIME_BUFFER_POOL_COUNTERS_H
-
-#include "util/runtime_profile.h"
-
-namespace doris {
-
-/// A set of counters for each buffer pool client.
-struct BufferPoolClientCounters {
-public:
-    /// Total amount of time spent inside BufferAllocator::AllocateBuffer().
-    RuntimeProfile::Counter* alloc_time;
-
-    /// Number of buffers allocated via BufferAllocator::AllocateBuffer().
-    RuntimeProfile::Counter* cumulative_allocations;
-
-    /// Bytes of buffers allocated via BufferAllocator::AllocateBuffer().
-    RuntimeProfile::Counter* cumulative_bytes_alloced;
-
-    /// The peak total size of unpinned pages.
-    RuntimeProfile::HighWaterMarkCounter* peak_unpinned_bytes;
-};
-
-} // namespace doris
-
-#endif
diff --git a/be/src/runtime/bufferpool/buffer_pool_internal.h b/be/src/runtime/bufferpool/buffer_pool_internal.h
deleted file mode 100644
index 2b0a083268..0000000000
--- a/be/src/runtime/bufferpool/buffer_pool_internal.h
+++ /dev/null
@@ -1,299 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <memory>
-#include <mutex>
-#include <sstream>
-
-#include "runtime/bufferpool/buffer_pool.h"
-#include "runtime/bufferpool/buffer_pool_counters.h"
-
-// Ensure that DCheckConsistency() function calls get removed in release builds.
-#ifndef NDEBUG
-#define DCHECK_CONSISTENCY() DCheckConsistency()
-#else
-#define DCHECK_CONSISTENCY()
-#endif
-
-namespace doris {
-
-/// The internal representation of a page, which can be pinned or unpinned. See the
-/// class comment for explanation of the different page states.
-class BufferPool::Page : public InternalList<Page>::Node {
-public:
-    Page(Client* client, int64_t len)
-            : client(client), len(len), pin_count(0), pin_in_flight(false) {}
-
-    std::string DebugString();
-
-    // Helper for BufferPool::DebugString().
-    static bool DebugStringCallback(std::stringstream* ss, BufferPool::Page* page);
-
-    /// The client that the page belongs to.
-    Client* const client;
-
-    /// The length of the page in bytes.
-    const int64_t len;
-
-    /// The pin count of the page. Only accessed in contexts that are passed the associated
-    /// PageHandle, so it cannot be accessed by multiple threads concurrently.
-    int pin_count;
-
-    /// True if the read I/O to pin the page was started but not completed. Only accessed
-    /// in contexts that are passed the associated PageHandle, so it cannot be accessed
-    /// by multiple threads concurrently.
-    bool pin_in_flight;
-
-    /// Non-null if there is a write in flight, the page is clean, or the page is evicted.
-    //std::unique_ptr<TmpFileMgr::WriteHandle> write_handle;
-
-    /// This lock must be held when accessing 'buffer' if the page is unpinned and not
-    /// evicted (i.e. it is safe to access 'buffer' if the page is pinned or evicted).
-    SpinLock buffer_lock;
-
-    /// Buffer with the page's contents. Closed only iff page is evicted. Open otherwise.
-    BufferHandle buffer;
-};
-
-/// Wrapper around InternalList<Page> that tracks the # of bytes in the list.
-class BufferPool::PageList {
-public:
-    PageList() : bytes_(0) {}
-    ~PageList() {
-        // Clients always empty out their list before destruction.
-        DCHECK(list_.empty());
-        DCHECK_EQ(0, bytes_);
-    }
-
-    void enqueue(Page* page) {
-        list_.enqueue(page);
-        bytes_ += page->len;
-    }
-
-    bool remove(Page* page) {
-        if (list_.remove(page)) {
-            bytes_ -= page->len;
-            return true;
-        }
-        return false;
-    }
-
-    Page* dequeue() {
-        Page* page = list_.dequeue();
-        if (page != nullptr) {
-            bytes_ -= page->len;
-        }
-        return page;
-    }
-
-    Page* pop_back() {
-        Page* page = list_.pop_back();
-        if (page != nullptr) {
-            bytes_ -= page->len;
-        }
-        return page;
-    }
-
-    void iterate(std::function<bool(Page*)> fn) { list_.iterate(fn); }
-    bool contains(Page* page) { return list_.contains(page); }
-    Page* tail() { return list_.tail(); }
-    bool empty() const { return list_.empty(); }
-    int size() const { return list_.size(); }
-    int64_t bytes() const { return bytes_; }
-
-    void DCheckConsistency() {
-        DCHECK_GE(bytes_, 0);
-        DCHECK_EQ(list_.empty(), bytes_ == 0);
-    }
-
-private:
-    InternalList<Page> list_;
-    int64_t bytes_;
-};
-
-/// The internal state for the client.
-class BufferPool::Client {
-public:
-    Client(BufferPool* pool, //TmpFileMgr::FileGroup* file_group,
-           const std::string& name, RuntimeProfile* profile);
-
-    ~Client() {
-        DCHECK_EQ(0, num_pages_);
-        DCHECK_EQ(0, buffers_allocated_bytes_);
-    }
-
-    void Close() {}
-
-    /// Create a pinned page using 'buffer', which was allocated using AllocateBuffer().
-    /// No client or page locks should be held by the caller.
-    Page* CreatePinnedPage(BufferHandle&& buffer);
-
-    /// Reset 'handle', clean up references to handle->page and release any resources
-    /// associated with handle->page. If the page is pinned, 'out_buffer' can be passed in
-    /// and the page's buffer will be returned.
-    /// Neither the client's lock nor handle->page_->buffer_lock should be held by the
-    /// caller.
-    void DestroyPageInternal(PageHandle* handle, BufferHandle* out_buffer = nullptr);
-
-    /// Updates client state to reflect that 'page' is now a dirty unpinned page. May
-    /// initiate writes for this or other dirty unpinned pages.
-    /// Neither the client's lock nor page->buffer_lock should be held by the caller.
-    void MoveToDirtyUnpinned(Page* page);
-
-    /// Move an unpinned page to the pinned state, moving between data structures and
-    /// reading from disk if necessary. Ensures the page has a buffer. If the data is
-    /// already in memory, ensures the data is in the page's buffer. If the data is on
-    /// disk, starts an async read of the data and sets 'pin_in_flight' on the page to
-    /// true. Neither the client's lock nor page->buffer_lock should be held by the caller.
-    Status StartMoveToPinned(ClientHandle* client, Page* page) WARN_UNUSED_RESULT;
-
-    /// Moves a page that has a pin in flight back to the evicted state, undoing
-    /// StartMoveToPinned(). Neither the client's lock nor page->buffer_lock should be held
-    /// by the caller.
-    //void UndoMoveEvictedToPinned(Page* page);
-
-    /// Finish the work of bring the data of an evicted page to memory if
-    /// page->pin_in_flight was set to true by StartMoveToPinned().
-    //Status FinishMoveEvictedToPinned(Page* page) WARN_UNUSED_RESULT;
-
-    /// Must be called once before allocating a buffer of 'len' via the AllocateBuffer()
-    /// API to deduct from the client's reservation and update internal accounting. Cleans
-    /// dirty pages if needed to satisfy the buffer pool's internal invariants. No page or
-    /// client locks should be held by the caller.
-    Status PrepareToAllocateBuffer(int64_t len) WARN_UNUSED_RESULT;
-
-    /// Called after a buffer of 'len' is freed via the FreeBuffer() API to update
-    /// internal accounting and release the buffer to the client's reservation. No page or
-    /// client locks should be held by the caller.
-    void FreedBuffer(int64_t len) {
-        std::lock_guard<std::mutex> cl(lock_);
-        buffers_allocated_bytes_ -= len;
-        DCHECK_CONSISTENCY();
-    }
-
-    /// Wait for the in-flight write for 'page' to complete.
-    /// 'lock_' must be held by the caller via 'client_lock'. page->buffer_lock should
-    /// not be held.
-    //void WaitForWrite(std::unique_lock<std::mutex>* client_lock, Page* page);
-
-    /// Test helper: wait for all in-flight writes to complete.
-    /// 'lock_' must not be held by the caller.
-    //void WaitForAllWrites();
-
-    /// Asserts that 'client_lock' is holding 'lock_'.
-    void DCheckHoldsLock(const std::unique_lock<std::mutex>& client_lock) {
-        DCHECK(client_lock.mutex() == &lock_ && client_lock.owns_lock()); // 'client_lock' must wrap this client's 'lock_' and currently own it
-    }
-
-    const BufferPoolClientCounters& counters() const { return counters_; } // read-only reference to this client's counters
-    //bool spilling_enabled() const { return file_group_ != nullptr; }
-    void set_debug_write_delay_ms(int val) { debug_write_delay_ms_ = val; } // stores 'val' as the debug write-completion delay
-    bool has_unpinned_pages() const {
-        // True when fewer pages are pinned than the client owns in total. Read without 'lock_':
-        // other threads should not be calling BufferPool functions that create, destroy or unpin pages.
-        return pinned_pages_.size() < num_pages_;
-    }
-
-    std::string DebugString();
-
-private:
-    // Check consistency of client, DCHECK if inconsistent. 'lock_' must be held.
-    void DCheckConsistency() {
-        DCHECK_GE(buffers_allocated_bytes_, 0); // the allocated-bytes counter must never go negative
-        pinned_pages_.DCheckConsistency(); // each of the three page lists validates itself
-        dirty_unpinned_pages_.DCheckConsistency();
-        in_flight_write_pages_.DCheckConsistency();
-        DCHECK_LE( // pages tracked across the three lists cannot exceed the client's page count
-                pinned_pages_.size() + dirty_unpinned_pages_.size() + in_flight_write_pages_.size(),
-                num_pages_);
-    }
-
-    /// Must be called once before allocating or reclaiming a buffer of 'len'. Ensures that
-    /// enough dirty pages are flushed to disk to satisfy the buffer pool's internal
-    /// invariants after the allocation. 'lock_' should be held by the caller via
-    /// 'client_lock'
-    Status CleanPages(std::unique_lock<std::mutex>* client_lock, int64_t len);
-
-    /// Initiates asynchronous writes of dirty unpinned pages to disk. Ensures that at
-    /// least 'min_bytes_to_write' bytes of writes will be written asynchronously. May
-    /// start writes more aggressively so that I/O and compute can be overlapped. If
-    /// any errors are encountered, 'write_status_' is set. 'write_status_' must therefore
-    /// be checked before reading back any pages. 'lock_' must be held by the caller.
-    //void WriteDirtyPagesAsync(int64_t min_bytes_to_write = 0);
-
-    /// Called when a write for 'page' completes.
-    //void WriteCompleteCallback(Page* page, const Status& write_status);
-
-    /// Move an evicted page to the pinned state by allocating a new buffer, starting an
-    /// async read from disk and moving the page to 'pinned_pages_'. client->impl must be
-    /// locked by the caller via 'client_lock' and handle->page must be unlocked.
-    /// 'client_lock' is released then reacquired.
-    //Status StartMoveEvictedToPinned(
-    //    std::unique_lock<std::mutex>* client_lock, ClientHandle* client, Page* page);
-
-    /// The buffer pool that owns the client.
-    BufferPool* const pool_;
-
-    /// The file group that should be used for allocating scratch space. If nullptr, spilling
-    /// is disabled.
-    //TmpFileMgr::FileGroup* const file_group_;
-
-    /// A name identifying the client.
-    const std::string name_;
-
-    /// The RuntimeProfile counters for this client, owned by the client's RuntimeProfile.
-    /// All non-nullptr.
-    BufferPoolClientCounters counters_;
-
-    /// Debug option to delay write completion.
-    int debug_write_delay_ms_;
-
-    /// Lock to protect the below member variables.
-    std::mutex lock_;
-
-    /// All non-OK statuses returned by write operations are merged into this status.
-    /// All operations that depend on pages being written to disk successfully (e.g.
-    /// reading pages back from disk) must check 'write_status_' before proceeding, so
-    /// that write errors that occurred asynchronously are correctly propagated. The
-    /// write error is global to the client so can be propagated to any Status-returning
-    /// operation for the client (even for operations on different Pages or Buffers).
-    /// Write errors are not recoverable so it is best to propagate them as quickly
-    /// as possible, instead of waiting to propagate them in a specific way.
-    Status write_status_;
-
-    /// Total number of pages for this client. Used for debugging and enforcing that all
-    /// pages are destroyed before the client.
-    int64_t num_pages_;
-
-    /// Total bytes of buffers in BufferHandles returned to clients (i.e. obtained from
-    /// AllocateBuffer() or ExtractBuffer()).
-    int64_t buffers_allocated_bytes_;
-
-    /// All pinned pages for this client.
-    PageList pinned_pages_;
-
-    /// Dirty unpinned pages for this client for which writes are not in flight. Page
-    /// writes are started in LIFO order, because operators typically have sequential access
-    /// patterns where the most recently evicted page will be last to be read.
-    PageList dirty_unpinned_pages_;
-
-    /// Dirty unpinned pages for this client for which writes are in flight.
-    PageList in_flight_write_pages_;
-};
-} // namespace doris
diff --git a/be/src/runtime/bufferpool/free_list.h b/be/src/runtime/bufferpool/free_list.h
deleted file mode 100644
index 7121033de8..0000000000
--- a/be/src/runtime/bufferpool/free_list.h
+++ /dev/null
@@ -1,115 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <cstdint>
-#include <vector>
-
-#include "common/logging.h"
-#include "gutil/macros.h"
-#include "runtime/bufferpool/buffer_pool.h"
-
-namespace doris {
-
-using BufferHandle = BufferPool::BufferHandle;
-
-/// A non-threadsafe list of free buffers.
-///
-/// Buffers are allocated by the caller and can be added to the list for later retrieval
-/// with AddFreeBuffer(). If the list is non-empty, calling PopFreeBuffer() will return
-/// one of the buffers previously added to the list. FreeList is agnostic about the size
-/// or other properties of the buffers added to it.
-///
-/// Buffers in the list can be freed at any point, e.g. if the list is storing too many
-/// free buffers (according to some policy). The caller is responsible for implementing
-/// the policy and calling FreeBuffers() or FreeAll() at the appropriate times.
-///
-/// Address space fragmentation
-/// ---------------------------
-/// To reduce memory fragmentation, the free list hands out buffers with lower memory
-/// addresses first and frees buffers with higher memory address first. If buffers were
-/// handed out by a policy that didn't take memory address into account, over time the
-/// distribution of free buffers within the address space would become essentially
-/// random. If free buffers were then unmapped, there would be many holes in the virtual
-/// memory map, which can cause difficulties for the OS in some cases, e.g. exceeding the
-/// maximum number of mmapped() regions (vm.max_map_count) in Linux. Using this approach
-/// will tend to consolidate free buffers in higher parts of the address space, allowing
-/// coalescing of the holes in most cases.
-class FreeList {
-public:
-    FreeList() = default;
-
-    /// Gets a free buffer. If the list is non-empty, returns true and moves the buffer
-    /// with the lowest memory address among those previously added with AddFreeBuffer()
-    /// into 'buffer'. Otherwise returns false and leaves 'buffer' untouched.
-    bool PopFreeBuffer(BufferHandle* buffer) {
-        if (free_list_.empty()) return false;
-        // pop_heap() moves the heap's top element to the back of the vector.
-        std::pop_heap(free_list_.begin(), free_list_.end(), HeapCompare);
-        *buffer = std::move(free_list_.back());
-        free_list_.pop_back();
-        return true;
-    }
-
-    /// Adds a free buffer to the list. Poison() is called on the buffer before it is
-    /// stored (NOTE(review): presumably marks the memory as not-to-be-touched while it
-    /// sits in the list -- confirm against BufferHandle).
-    void AddFreeBuffer(BufferHandle&& buffer) {
-        buffer.Poison();
-        free_list_.emplace_back(std::move(buffer));
-        std::push_heap(free_list_.begin(), free_list_.end(), HeapCompare);
-    }
-
-    /// Removes and returns the 'num_buffers' buffers with the highest memory addresses so
-    /// that the caller can free them. 'num_buffers' must lie in [0, Size()]; this is
-    /// DCHECKed, and when DCHECKs are compiled out the count is clamped to that range
-    /// rather than popping from an empty list. The average time complexity is n log n,
-    /// where n is the current size of the list.
-    std::vector<BufferHandle> GetBuffersToFree(int64_t num_buffers) {
-        DCHECK_GE(num_buffers, 0);
-        DCHECK_LE(num_buffers, Size());
-        const int64_t num_to_free = std::max<int64_t>(0, std::min<int64_t>(num_buffers, Size()));
-
-        std::vector<BufferHandle> buffers;
-        buffers.reserve(static_cast<std::size_t>(num_to_free));
-        // Sort the list so we can free the buffers with higher memory addresses.
-        // Note that the sorted list is still a valid min-heap.
-        std::sort(free_list_.begin(), free_list_.end(), SortCompare);
-
-        for (int64_t i = 0; i < num_to_free; ++i) {
-            buffers.emplace_back(std::move(free_list_.back()));
-            free_list_.pop_back();
-        }
-        return buffers;
-    }
-
-    /// Returns the number of buffers currently in the list.
-    int64_t Size() const { return static_cast<int64_t>(free_list_.size()); }
-
-private:
-    friend class FreeListTest;
-
-    DISALLOW_COPY_AND_ASSIGN(FreeList);
-
-    /// Compare function that orders by ascending memory address.
-    static bool SortCompare(const BufferHandle& b1, const BufferHandle& b2) {
-        return b1.data() < b2.data();
-    }
-
-    /// Compare function that orders by memory address. Needs to be inverse of SortCompare()
-    /// because the standard heap algorithms build a max-heap.
-    static bool HeapCompare(const BufferHandle& b1, const BufferHandle& b2) {
-        return SortCompare(b2, b1);
-    }
-
-    /// List of free memory buffers. Maintained as a min-heap ordered by the memory address
-    /// of the buffer.
-    std::vector<BufferHandle> free_list_;
-};
-} // namespace doris
diff --git a/be/src/runtime/bufferpool/suballocator.cc b/be/src/runtime/bufferpool/suballocator.cc
deleted file mode 100644
index f26aee6205..0000000000
--- a/be/src/runtime/bufferpool/suballocator.cc
+++ /dev/null
@@ -1,252 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "runtime/bufferpool/suballocator.h"
-
-#include <new>
-
-#include "gutil/strings/substitute.h"
-#include "util/bit_util.h"
-
-namespace doris {
-
-constexpr int Suballocator::LOG_MAX_ALLOCATION_BYTES; // namespace-scope definitions of the static constexpr members declared in suballocator.h
-constexpr int64_t Suballocator::MAX_ALLOCATION_BYTES;
-constexpr int Suballocator::LOG_MIN_ALLOCATION_BYTES;
-constexpr int64_t Suballocator::MIN_ALLOCATION_BYTES; // NOTE(review): bound by reference in std::max() below; presumably redundant under C++17 inline variables -- confirm
-//const int Suballocator::NUM_FREE_LISTS;
-
-// Remembers the pool/client pair that buffers are drawn from and the minimum buffer
-// length; the count of bytes handed out but not yet freed starts at zero.
-Suballocator::Suballocator(BufferPool* pool, BufferPool::ClientHandle* client,
-                           int64_t min_buffer_len)
-        : pool_(pool),
-          client_(client),
-          min_buffer_len_(min_buffer_len),
-          allocated_(0) {}
-
-Suballocator::~Suballocator() {
-    // Debug-only checks: no bytes may still be counted in 'allocated_' and every free-list head must be null.
-    DCHECK_EQ(allocated_, 0);
-    for (int i = 0; i < NUM_FREE_LISTS; ++i) {
-        DCHECK(free_lists_[i] == nullptr);
-    }
-}
-
-// Serves an allocation of at least 'bytes' bytes and stores it in '*result'.
-//
-// Requests above MAX_ALLOCATION_BYTES are rejected with an InternalError. Smaller
-// requests are raised to MIN_ALLOCATION_BYTES, then served from the first non-empty
-// free list at or above the matching index; if every such list is empty a new buffer
-// is requested through AllocateBuffer(). A node larger than needed is split down with
-// SplitToSize(). '*result' is set to nullptr (with an OK status) if AllocateBuffer()
-// succeeds without producing a node. Errors from AllocateBuffer() and SplitToSize()
-// are returned unchanged, and '*result' is not written in that case.
-Status Suballocator::Allocate(int64_t bytes, std::unique_ptr<Suballocation>* result) {
-    DCHECK_GE(bytes, 0);
-    if (UNLIKELY(bytes > MAX_ALLOCATION_BYTES)) {
-        std::stringstream err_stream;
-        err_stream << "Requested memory allocation of " << bytes << " bytes, larger than max "
-                   << "supported of " << MAX_ALLOCATION_BYTES << " bytes";
-        return Status::InternalError(err_stream.str());
-    }
-    std::unique_ptr<Suballocation> free_node;
-    bytes = std::max(bytes, MIN_ALLOCATION_BYTES);
-    const int target_list_idx = ComputeListIndex(bytes);
-    // Smallest adequate size class first; larger nodes are split below.
-    for (int i = target_list_idx; i < NUM_FREE_LISTS; ++i) {
-        free_node = PopFreeListHead(i);
-        if (free_node != nullptr) break;
-    }
-
-    if (free_node == nullptr) {
-        // Unable to find free allocation, need to get more memory from buffer pool.
-        RETURN_IF_ERROR(AllocateBuffer(bytes, &free_node));
-        if (free_node == nullptr) {
-            *result = nullptr;
-            return Status::OK();
-        }
-    }
-
-    // Free node may be larger than required.
-    const int free_list_idx = ComputeListIndex(free_node->len_);
-    if (free_list_idx != target_list_idx) {
-        RETURN_IF_ERROR(SplitToSize(std::move(free_node), bytes, &free_node));
-        DCHECK(free_node != nullptr);
-    }
-
-    // Mark the node as handed out and account for its full (rounded) length.
-    free_node->in_use_ = true;
-    allocated_ += free_node->len_;
-    *result = std::move(free_node);
-    return Status::OK();
-}
-
-// Maps an allocation size to its slot in 'free_lists_': the value returned by
-// BitUtil::Log2CeilingNonZero64(bytes), shifted down by LOG_MIN_ALLOCATION_BYTES.
-int Suballocator::ComputeListIndex(int64_t bytes) const {
-    const auto log2_ceiling = BitUtil::Log2CeilingNonZero64(bytes);
-    return log2_ceiling - LOG_MIN_ALLOCATION_BYTES;
-}
-
-// Reports how many bytes an Allocate(bytes) call would have to obtain from the buffer
-// pool: 0 if some free list at or above the target index is non-empty, otherwise the
-// larger of 'min_buffer_len_' and 'bytes' rounded up to a power of two.
-uint64_t Suballocator::ComputeAllocateBufferSize(int64_t bytes) const {
-    bytes = std::max(bytes, MIN_ALLOCATION_BYTES);
-    int list_idx = ComputeListIndex(bytes);
-    while (list_idx < NUM_FREE_LISTS) {
-        if (CheckFreeListHeadNotNull(list_idx)) {
-            return 0;
-        }
-        ++list_idx;
-    }
-    return std::max(min_buffer_len_, BitUtil::RoundUpToPowerOfTwo(bytes));
-}
-
-// Obtains one whole buffer from 'pool_' on behalf of 'client_' and wraps it in a new
-// root Suballocation stored in '*result'. The buffer length is the larger of
-// 'min_buffer_len_' and 'bytes' rounded up to a power of two. Any error from creating
-// the node or from the pool is returned unchanged.
-Status Suballocator::AllocateBuffer(int64_t bytes, std::unique_ptr<Suballocation>* result) {
-    DCHECK_LE(bytes, MAX_ALLOCATION_BYTES);
-    const int64_t pool_buffer_len =
-            std::max(min_buffer_len_, BitUtil::RoundUpToPowerOfTwo(bytes));
-
-    std::unique_ptr<Suballocation> root;
-    RETURN_IF_ERROR(Suballocation::Create(&root));
-    RETURN_IF_ERROR(pool_->AllocateBuffer(client_, pool_buffer_len, &root->buffer_));
-
-    // The root node spans the entire buffer it owns.
-    root->len_ = pool_buffer_len;
-    root->data_ = root->buffer_.data();
-    *result = std::move(root);
-    return Status::OK();
-}
-
-Status Suballocator::SplitToSize(std::unique_ptr<Suballocation> free_node, int64_t target_bytes,
-                                 std::unique_ptr<Suballocation>* result) { // halves 'free_node' until it reaches the size class of 'target_bytes'
-    DCHECK(!free_node->in_use_);
-    DCHECK_GT(free_node->len_, target_bytes);
-
-    const int free_list_idx = ComputeListIndex(free_node->len_);
-    const int target_list_idx = ComputeListIndex(target_bytes);
-
-    // Preallocate nodes to avoid handling allocation failures during splitting.
-    // Need two nodes per level for the left and right children.
-    const int num_nodes = (free_list_idx - target_list_idx) * 2;
-    constexpr int MAX_NUM_NODES = NUM_FREE_LISTS * 2;
-    std::unique_ptr<Suballocation> nodes[MAX_NUM_NODES];
-    for (int i = 0; i < num_nodes; ++i) {
-        if (!Suballocation::Create(&nodes[i]).ok()) {
-            // Add the free node to the free list to restore the allocator to an internally
-            // consistent state. Nodes already created are released when 'nodes' goes out of scope.
-            AddToFreeList(std::move(free_node));
-            return Status::InternalError("Failed to allocate list node in Suballocator");
-        }
-    }
-
-    // Iteratively split from the current size down to the target size. We will return
-    // the leftmost descendant node.
-    int next_node = 0;
-    for (int i = free_list_idx; i > target_list_idx; --i) {
-        DCHECK_EQ(free_node->len_, 1LL << (i + LOG_MIN_ALLOCATION_BYTES));
-        std::unique_ptr<Suballocation> left_child = std::move(nodes[next_node++]);
-        std::unique_ptr<Suballocation> right_child = std::move(nodes[next_node++]);
-        DCHECK_LE(next_node, num_nodes);
-
-        const int64_t child_len = free_node->len_ / 2;
-        left_child->data_ = free_node->data_; // left half starts where the parent starts
-        right_child->data_ = free_node->data_ + child_len; // right half starts at the midpoint
-        left_child->len_ = right_child->len_ = child_len;
-        left_child->buddy_ = right_child.get();
-        right_child->buddy_ = left_child.get();
-        free_node->in_use_ = true; // a split node counts as in use until its children are coalesced
-        left_child->parent_ = std::move(free_node); // the left child becomes the parent's owner
-
-        AddToFreeList(std::move(right_child)); // right half stays available for later requests
-        free_node = std::move(left_child); // keep splitting the left half
-    }
-    *result = std::move(free_node);
-    return Status::OK();
-}
-
-uint64_t Suballocator::Free(std::unique_ptr<Suballocation> allocation) { // returns the length of the buffer handed back to the pool, or 0 if none was
-    if (allocation == nullptr) return 0;
-
-    DCHECK(allocation->in_use_);
-    allocation->in_use_ = false;
-    allocated_ -= allocation->len_;
-
-    // Iteratively coalesce buddies until the buddy is in use or we get to the root.
-    // This ensures that all buddies in the free lists are coalesced. I.e. we do not
-    // have two buddies in the same free list.
-    std::unique_ptr<Suballocation> curr_allocation = std::move(allocation);
-    while (curr_allocation->buddy_ != nullptr) {
-        if (curr_allocation->buddy_->in_use_) {
-            // If the buddy is not free we can't coalesce, just add this node to the free list.
-            AddToFreeList(std::move(curr_allocation));
-            return 0;
-        }
-        std::unique_ptr<Suballocation> buddy = RemoveFromFreeList(curr_allocation->buddy_);
-        curr_allocation = CoalesceBuddies(std::move(curr_allocation), std::move(buddy)); // continue one level up with the parent
-    }
-
-    // Reached root, which is an entire free buffer. We are not using it, so free up memory.
-    DCHECK(curr_allocation->buffer_.is_open());
-    auto free_len = curr_allocation->buffer_.len(); // read the length before the buffer is handed back
-    pool_->FreeBuffer(client_, &curr_allocation->buffer_);
-    curr_allocation.reset();
-    return free_len;
-}
-
-void Suballocator::AddToFreeList(std::unique_ptr<Suballocation> node) { // pushes 'node' onto the head of the free list for its length
-    DCHECK(!node->in_use_);
-    int list_idx = ComputeListIndex(node->len_);
-    if (free_lists_[list_idx] != nullptr) {
-        free_lists_[list_idx]->prev_free_ = node.get(); // old head gets a raw back-pointer to the new head
-    }
-    node->next_free_ = std::move(free_lists_[list_idx]); // new head takes ownership of the old head (may be null)
-    DCHECK(node->prev_free_ == nullptr);
-    free_lists_[list_idx] = std::move(node);
-}
-
-std::unique_ptr<Suballocation> Suballocator::RemoveFromFreeList(Suballocation* node) { // unlinks 'node' from its free list and returns ownership of it
-    DCHECK(node != nullptr);
-    const int list_idx = ComputeListIndex(node->len_);
-
-    if (node->next_free_ != nullptr) {
-        node->next_free_->prev_free_ = node->prev_free_; // successor now points back past 'node'
-    }
-
-    std::unique_ptr<Suballocation>* ptr_from_prev = // the owning pointer to 'node': the list head slot or the predecessor's next_free_
-            node->prev_free_ == nullptr ? &free_lists_[list_idx] : &node->prev_free_->next_free_;
-    node->prev_free_ = nullptr;
-    std::unique_ptr<Suballocation> result = std::move(*ptr_from_prev); // take ownership of 'node' out of that slot
-    *ptr_from_prev = std::move(node->next_free_); // splice the successor (may be null) into the vacated slot
-    return result;
-}
-
-std::unique_ptr<Suballocation> Suballocator::PopFreeListHead(int list_idx) { // detaches and returns the first node of list 'list_idx', or nullptr if empty
-    if (free_lists_[list_idx] == nullptr) return nullptr;
-    std::unique_ptr<Suballocation> result = std::move(free_lists_[list_idx]);
-    DCHECK(result->prev_free_ == nullptr);
-    if (result->next_free_ != nullptr) {
-        result->next_free_->prev_free_ = nullptr; // the successor becomes the new head, so it has no predecessor
-    }
-    free_lists_[list_idx] = std::move(result->next_free_); // list now owns the successor (may be null)
-    return result;
-}
-
-// Reports whether the free list at 'list_idx' currently has at least one node.
-bool Suballocator::CheckFreeListHeadNotNull(int list_idx) const {
-    const std::unique_ptr<Suballocation>& head = free_lists_[list_idx];
-    return head != nullptr;
-}
-
-// Merges two free buddies back into their parent node. Ownership of the parent is
-// taken from whichever buddy holds it, the parent is marked as not in use, and it is
-// returned; both buddy nodes are destroyed when this function returns.
-std::unique_ptr<Suballocation> Suballocator::CoalesceBuddies(std::unique_ptr<Suballocation> b1,
-                                                             std::unique_ptr<Suballocation> b2) {
-    DCHECK(!b1->in_use_);
-    DCHECK(!b2->in_use_);
-    DCHECK_EQ(b1->buddy_, b2.get());
-    DCHECK_EQ(b2->buddy_, b1.get());
-    // Exactly one of the two (the left child) holds the owning pointer to the parent.
-    DCHECK((b1->parent_ != nullptr) ^ (b2->parent_ != nullptr));
-
-    std::unique_ptr<Suballocation> merged;
-    if (b1->parent_ != nullptr) {
-        merged = std::move(b1->parent_);
-    } else {
-        merged = std::move(b2->parent_);
-    }
-    merged->in_use_ = false;
-    return merged;
-}
-
-// Heap-allocates a default-constructed Suballocation and stores it in
-// '*new_suballocation'. Returns MemoryAllocFailed if the non-throwing new yields
-// nullptr, in which case '*new_suballocation' is left null.
-Status Suballocation::Create(std::unique_ptr<Suballocation>* new_suballocation) {
-    // The system allocator is used for simplicity: these nodes are not expected to be
-    // performance critical, nor to back small allocations where the per-node CPU/memory
-    // overhead would matter.
-    Suballocation* raw_node = new (std::nothrow) Suballocation();
-    new_suballocation->reset(raw_node);
-    if (raw_node == nullptr) {
-        return Status::MemoryAllocFailed("allocate memory failed");
-    }
-    return Status::OK();
-}
-} // namespace doris
diff --git a/be/src/runtime/bufferpool/suballocator.h b/be/src/runtime/bufferpool/suballocator.h
deleted file mode 100644
index eb5fd0d1fb..0000000000
--- a/be/src/runtime/bufferpool/suballocator.h
+++ /dev/null
@@ -1,221 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-
-#include "runtime/bufferpool/buffer_pool.h"
-
-namespace doris {
-
-class Suballocation;
-
-/// Helper class to subdivide buffers from the buffer pool. Implements a buddy
-/// allocation algorithm optimised for power-of-two allocations. At or above the
-/// 'min_buffer_len' value, each allocation is backed by a power-of-two buffer from
-/// a BufferPool. Below that threshold, each allocation is backed by a
-/// 'min_buffer_len' buffer split recursively into equal-sized buddies until the
-/// desired allocation size is reached. Every time an allocation is freed,
-/// free buddies are coalesced eagerly and whole buffers are freed eagerly.
-///
-/// The algorithms used are asymptotically efficient: O(log(max allocation size)), but
-/// the implementation's constant-factor overhead is not optimised. Thus, the allocator
-/// is best suited for relatively large allocations where the constant CPU/memory
-/// overhead per allocation is not paramount, e.g. bucket directories of hash tables.
-/// All allocations less than MIN_ALLOCATION_BYTES are rounded up to that amount.
-///
-/// Methods of Suballocator are not thread safe.
-///
-/// Implementation:
-/// ---------------
-/// The allocator uses two key data structures: a number of binary trees representing
-/// the buddy relationships between allocations and a set of free lists, one for each
-/// power-of-two size.
-///
-/// Each buffer allocated from the buffer pool has a tree of Suballocations associated
-/// with it that use the memory from that buffer. The root of the tree is the
-/// Suballocation corresponding to the entire buffer. Each node has either zero children
-/// (if it hasn't been split) or two children (if it has been split into two buddy
-/// allocations). Each non-root Suballocation has pointers to its buddy and its parent
-/// to enable coalescing the buddies into the parent when both are free.
-///
-/// Suballocations are eagerly coalesced when freed, so a Suballocation only has children
-/// if one of its descendants is allocated.
-///
-/// The free lists are doubly-linked lists of free Suballocation objects that support
-/// O(1) add and remove. The next and previous pointers are stored in the
-/// Suballocation object so no auxiliary memory is required.
-class Suballocator {
-public:
-    /// Constructs a suballocator that allocates memory from 'pool' with 'client'.
-    /// Suballocations smaller than 'min_buffer_len' are handled by allocating a
-    /// buffer of 'min_buffer_len' and recursively splitting it.
-    Suballocator(BufferPool* pool, BufferPool::ClientHandle* client, int64_t min_buffer_len);
-
-    ~Suballocator();
-    /// Computes how many bytes Allocate() would need to obtain from the BufferPool for 'bytes' (0 if an
-    /// existing free-list entry can serve the request). Used to try to consume memory in BufferedBlockMgr.
-    uint64_t ComputeAllocateBufferSize(int64_t bytes) const;
-    /// Allocate bytes from BufferPool. The allocation is nullptr if unsuccessful because
-    /// the client's reservation was insufficient. If an unexpected error is encountered,
-    /// returns that status. The allocation size is rounded up to the next power-of-two.
-    /// The caller must always free the allocation by calling Free() (otherwise destructing
-    /// the returned 'result' will DCHECK on debug builds or otherwise misbehave on release
-    /// builds).
-    ///
-    /// Allocate() will try to increase the client's buffer pool reservation to fulfill
-    /// the requested allocation if needed.
-    ///
-    /// The memory returned is at least 8-byte aligned.
-    Status Allocate(int64_t bytes, std::unique_ptr<Suballocation>* result);
-
-    /// Free the allocation. Does nothing if allocation is nullptr (e.g. was the result of a
-    /// failed Allocate() call). Returns the number of bytes actually released back to the BufferPool (0 if no whole buffer was freed).
-    uint64_t Free(std::unique_ptr<Suballocation> allocation);
-
-    /// Upper bounds on the max allocation size and the number of different
-    /// power-of-two allocation sizes. Used to bound the number of free lists.
-    static constexpr int LOG_MAX_ALLOCATION_BYTES = BufferPool::LOG_MAX_BUFFER_BYTES;
-    static constexpr int64_t MAX_ALLOCATION_BYTES = BufferPool::MAX_BUFFER_BYTES;
-
-    /// Don't support allocations less than 4kb to avoid high overhead.
-    static constexpr int LOG_MIN_ALLOCATION_BYTES = 12;
-    static constexpr int64_t MIN_ALLOCATION_BYTES = 1L << LOG_MIN_ALLOCATION_BYTES;
-
-private:
-    DISALLOW_COPY_AND_ASSIGN(Suballocator);
-
-    /// Compute the index for allocations of size 'bytes' in 'free_lists_'. 'bytes' is
-    /// rounded up to the next power-of-two if it is not already a power-of-two.
-    int ComputeListIndex(int64_t bytes) const;
-
-    /// Allocate a buffer of size 'bytes' <= MAX_ALLOCATION_BYTES from the buffer pool and
-    /// initialize 'result' with it. If the reservation is insufficient, try to increase
-    /// the reservation to fit.
-    Status AllocateBuffer(int64_t bytes, std::unique_ptr<Suballocation>* result);
-
-    /// Split the free allocation until we get an allocation of 'target_bytes' rounded up
-    /// to a power-of-two. This allocation is returned. The other allocations resulting
-    /// from the splits are added to free lists. node->in_use must be false and 'node'
-    /// must not be in any free list. Can fail if allocating memory for data structures
-    /// fails.
-    Status SplitToSize(std::unique_ptr<Suballocation> node, int64_t target_bytes,
-                       std::unique_ptr<Suballocation>* result);
-
-    // Add allocation to the head of the free list that matches its length.
-    void AddToFreeList(std::unique_ptr<Suballocation> node);
-
-    // Remove allocation from its free list and return ownership of it.
-    std::unique_ptr<Suballocation> RemoveFromFreeList(Suballocation* node);
-
-    // Get the allocation at the head of the free list at index 'list_idx'. Return nullptr
-    // if list is empty.
-    std::unique_ptr<Suballocation> PopFreeListHead(int list_idx);
-
-    // Returns true if the free list at index 'list_idx' is non-empty.
-    bool CheckFreeListHeadNotNull(int list_idx) const;
-
-    /// Coalesce two free buddies, 'b1' and 'b2'. Frees 'b1' and 'b2', marks the parent
-    /// not in use and returns it.
-    std::unique_ptr<Suballocation> CoalesceBuddies(std::unique_ptr<Suballocation> b1,
-                                                   std::unique_ptr<Suballocation> b2);
-
-    /// The pool and corresponding client to allocate buffers from.
-    BufferPool* pool_;
-    BufferPool::ClientHandle* client_;
-
-    /// The minimum length of buffer to allocate. To serve allocations below this threshold,
-    /// a larger buffer is allocated and split into multiple allocations.
-    const int64_t min_buffer_len_;
-
-    /// Track how much memory has been returned in allocations but not freed.
-    int64_t allocated_;
-
-    /// Free lists for each supported power-of-two size. Statically allocate the maximum
-    /// possible number of lists for simplicity. Indexed by log2 of the allocation size
-    /// minus log2 of the minimum allocation size, e.g. 16k allocations are at index 2.
-    /// Each free list should only include one buddy of each pair: if both buddies are
-    /// free, they should have been coalesced.
-    ///
-    /// Each free list is implemented as a doubly-linked list.
-    static constexpr int NUM_FREE_LISTS = LOG_MAX_ALLOCATION_BYTES - LOG_MIN_ALLOCATION_BYTES + 1;
-    std::unique_ptr<Suballocation> free_lists_[NUM_FREE_LISTS];
-};
-
-/// An allocation made by a Suballocator. Each allocation returned by Suballocator must
-/// be freed with Suballocator::Free().
-///
-/// Unique_ptr is used to manage ownership of these Suballocations as a guard against
-/// memory leaks. The owner of the unique_ptr is either:
-/// - client code, if the suballocation is in use
-/// - the free list array, if the suballocation is the head of a free list
-/// - the previous free list entry, if the suballocation is a subsequent free list entry
-/// - the suballocation's left child, if the suballocation is split
-class Suballocation {
-public:
-    // Debug-only leak guard: a node must not be destroyed while still marked in use.
-    ~Suballocation() { DCHECK(!in_use_); }
-
-    int64_t len() const { return len_; }
-    uint8_t* data() const { return data_; }
-
-private:
-    friend class Suballocator;
-
-    DISALLOW_COPY_AND_ASSIGN(Suballocation);
-
-    /// Factory for Suballocation nodes. Reports a failed heap allocation through the
-    /// returned Status.
-    static Status Create(std::unique_ptr<Suballocation>* new_suballocation);
-
-    // Nodes are built through Create(), which offers better error handling; every
-    // field starts from the default member initializer given below.
-    Suballocation() = default;
-
-    /// Start address of the memory covered by this allocation.
-    uint8_t* data_ = nullptr;
-
-    /// Length in bytes of the memory covered by this allocation; -1 until assigned.
-    int64_t len_ = -1;
-
-    /// The whole buffer behind this node when the node represents an entire buffer;
-    /// otherwise unused. 'buffer_' is open iff 'buddy_' is nullptr.
-    BufferPool::BufferHandle buffer_;
-
-    /// Set only on a left child: owns the parent node shared by this node and its buddy.
-    /// The parent covers the contiguous memory of both halves; keeping it in the left
-    /// child alone gives it a single owner.
-    std::unique_ptr<Suballocation> parent_;
-
-    /// The other half of the split: same size and adjacent in memory. Both buddies
-    /// share a lifetime -- created in SplitToSize() and destroyed in CoalesceBuddies().
-    Suballocation* buddy_ = nullptr;
-
-    /// Owning link to the next entry while this node sits in a free list; nullptr at
-    /// the tail. The next entry stores a raw back-pointer to this node.
-    std::unique_ptr<Suballocation> next_free_;
-
-    /// Raw back-pointer to the previous free-list entry, which owns this node; nullptr
-    /// when this node is the head of its list.
-    Suballocation* prev_free_ = nullptr;
-
-    /// True while the node is handed out by Allocate() and not yet freed, or while it
-    /// is split into two child Suballocations.
-    bool in_use_ = false;
-};
-} // namespace doris
diff --git a/be/src/runtime/bufferpool/system_allocator.cc b/be/src/runtime/bufferpool/system_allocator.cc
deleted file mode 100644
index cc5f7b7a27..0000000000
--- a/be/src/runtime/bufferpool/system_allocator.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "runtime/bufferpool/system_allocator.h"
-
-#include <gperftools/malloc_extension.h>
-#include <sys/mman.h>
-
-#include "common/config.h"
-#include "gutil/strings/substitute.h"
-#include "runtime/thread_context.h"
-#include "util/bit_util.h"
-#include "util/error_util.h"
-
-// TODO: IMPALA-5073: this should eventually become the default once we are confident
-// that it is superior to allocating via TCMalloc.
-//DEFINE_bool(mmap_buffers, false,
-//    "(Experimental) If true, allocate buffers directly from the operating system "
-//    "instead of with TCMalloc.");
-
-//DEFINE_bool(madvise_huge_pages, true,
-//    "(Advanced) If true, advise operating system to back large memory buffers with huge "
-//    "pages");
-
-namespace doris {
-
-/// Page sizes assumed for x86-64. They are hard-coded rather than parsed from /proc/meminfo
-/// because they are unlikely to change unless we port to a different architecture.
-static int64_t SMALL_PAGE_SIZE = 4LL * 1024; // 4 KiB
-static int64_t HUGE_PAGE_SIZE = 2LL * 1024 * 1024; // 2 MiB
-
-SystemAllocator::SystemAllocator(int64_t min_buffer_len) : min_buffer_len_(min_buffer_len) {
-    DCHECK(BitUtil::IsPowerOf2(min_buffer_len)); // Precondition is enforced by DCHECK only.
-}
-
-Status SystemAllocator::Allocate(int64_t len, BufferPool::BufferHandle* buffer) {
-    DCHECK_GE(len, min_buffer_len_);
-    DCHECK_LE(len, BufferPool::MAX_BUFFER_BYTES);
-    DCHECK(BitUtil::IsPowerOf2(len)) << len;
-
-    uint8_t* buffer_mem; // Written by whichever allocation helper succeeds below.
-    if (config::mmap_buffers) { // Free() branches on the same flag (munmap vs. free).
-        RETURN_IF_ERROR(AllocateViaMMap(len, &buffer_mem));
-    } else {
-        RETURN_IF_ERROR(AllocateViaMalloc(len, &buffer_mem));
-    }
-    buffer->Open(buffer_mem, len, CpuInfo::get_current_core()); // Reached only on success.
-    return Status::OK();
-}
-
-Status SystemAllocator::AllocateViaMMap(int64_t len, uint8_t** buffer_mem) {
-    int64_t map_len = len;
-    bool use_huge_pages = len % HUGE_PAGE_SIZE == 0 && config::madvise_huge_pages;
-    if (use_huge_pages) {
-        // Over-map by one huge page so a huge-page-aligned 'len'-byte window always fits.
-        map_len += HUGE_PAGE_SIZE;
-    }
-    CONSUME_THREAD_MEM_TRACKER(map_len); // Paired with the RELEASE_THREAD_MEM_TRACKER calls below.
-    uint8_t* mem = reinterpret_cast<uint8_t*>(
-            mmap(nullptr, map_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0));
-    if (mem == MAP_FAILED) {
-        RELEASE_THREAD_MEM_TRACKER(map_len);
-        return Status::BufferAllocFailed("mmap failed");
-    }
-
-    if (use_huge_pages) {
-        // mmap() may return memory that is not aligned to the huge page size. For the
-        // subsequent madvise() call to work well, we need to align it ourselves and
-        // unmap the memory on either side of the buffer that we don't need.
-        uintptr_t misalignment = reinterpret_cast<uintptr_t>(mem) % HUGE_PAGE_SIZE;
-        if (misalignment != 0) {
-            uintptr_t fixup = HUGE_PAGE_SIZE - misalignment;
-            munmap(mem, fixup); // Drops the unaligned head; the return value is not checked.
-            RELEASE_THREAD_MEM_TRACKER(fixup);
-            mem += fixup;
-            map_len -= fixup;
-        }
-        munmap(mem + len, map_len - len); // Drops the surplus tail; return value not checked.
-        RELEASE_THREAD_MEM_TRACKER(map_len - len);
-        DCHECK_EQ(reinterpret_cast<uintptr_t>(mem) % HUGE_PAGE_SIZE, 0) << mem;
-        // Mark the buffer as a candidate for promotion to huge pages. The Linux Transparent
-        // Huge Pages implementation will try to back the memory with a huge page if it is
-        // enabled. MADV_HUGEPAGE was introduced in 2.6.38, so this code is compiled out
-        // when building against kernel headers that do not define it.
-#ifdef MADV_HUGEPAGE
-        int rc;
-        // According to madvise() docs it may return EAGAIN to signal that we should retry.
-        do {
-            rc = madvise(mem, len, MADV_HUGEPAGE);
-        } while (rc == -1 && errno == EAGAIN);
-        DCHECK(rc == 0) << "madvise(MADV_HUGEPAGE) shouldn't fail" << errno;
-#endif
-    }
-    *buffer_mem = mem;
-    return Status::OK();
-}
-
-Status SystemAllocator::AllocateViaMalloc(int64_t len, uint8_t** buffer_mem) {
-    bool use_huge_pages = len % HUGE_PAGE_SIZE == 0 && config::madvise_huge_pages;
-    // Allocate, aligned to the page size that we expect to back the memory range.
-    // This ensures that it can be backed by whole pages, rather than parts of pages.
-    size_t alignment = use_huge_pages ? HUGE_PAGE_SIZE : SMALL_PAGE_SIZE;
-    int rc = posix_memalign(reinterpret_cast<void**>(buffer_mem), alignment, len);
-#if !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER)
-    // Works around an ASAN bug where posix_memalign returns 0 instead of ENOMEM on failure
-    // (https://bugs.llvm.org/show_bug.cgi?id=32968). NOTE(review): guard excludes sanitizer builds.
-    if (rc == 0 && *buffer_mem == nullptr && len != 0) rc = ENOMEM;
-#endif
-    if (rc != 0) {
-        return Status::InternalError("posix_memalign() failed to allocate buffer: {}",
-                                     get_str_err_msg()); // NOTE(review): error is in 'rc'; confirm helper does not read errno.
-    }
-    if (use_huge_pages) {
-#ifdef MADV_HUGEPAGE
-        // According to madvise() docs it may return EAGAIN to signal that we should retry.
-        do {
-            rc = madvise(*buffer_mem, len, MADV_HUGEPAGE);
-        } while (rc == -1 && errno == EAGAIN);
-        DCHECK(rc == 0) << "madvise(MADV_HUGEPAGE) shouldn't fail" << errno;
-#endif
-    }
-    return Status::OK();
-}
-
-void SystemAllocator::Free(BufferPool::BufferHandle&& buffer) {
-    if (config::mmap_buffers) { // Mirrors the config::mmap_buffers branch in Allocate().
-        int rc = munmap(buffer.data(), buffer.len());
-        RELEASE_THREAD_MEM_TRACKER(buffer.len());
-        DCHECK_EQ(rc, 0) << "Unexpected munmap() error: " << errno;
-    } else {
-        bool use_huge_pages = buffer.len() % HUGE_PAGE_SIZE == 0 && config::madvise_huge_pages;
-        if (use_huge_pages) {
-            // Undo the madvise so that it isn't a candidate to be newly backed by huge pages.
-            // We depend on TCMalloc's "aggressive decommit" mode decommitting the physical
-            // huge pages with madvise(DONTNEED) when we call free(). Otherwise, this huge
-            // page region may be divvied up and subsequently decommitted in smaller chunks,
-            // which may not actually release the physical memory, causing physical
-            // memory usage to exceed the process limit.
-#ifdef MADV_NOHUGEPAGE
-            // According to madvise() docs it may return EAGAIN to signal that we should retry.
-            int rc;
-            do {
-                rc = madvise(buffer.data(), buffer.len(), MADV_NOHUGEPAGE);
-            } while (rc == -1 && errno == EAGAIN);
-            DCHECK(rc == 0) << "madvise(MADV_NOHUGEPAGE) shouldn't fail" << errno;
-#endif
-        }
-        free(buffer.data());
-    }
-    buffer.Reset(); // Avoid DCHECK in ~BufferHandle().
-}
-} // namespace doris
diff --git a/be/src/runtime/bufferpool/system_allocator.h b/be/src/runtime/bufferpool/system_allocator.h
deleted file mode 100644
index 83c3a4507b..0000000000
--- a/be/src/runtime/bufferpool/system_allocator.h
+++ /dev/null
@@ -1,49 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include "common/status.h"
-#include "runtime/bufferpool/buffer_pool.h"
-
-namespace doris {
-
-/// The underlying memory allocator for the buffer pool. It obtains buffer memory either
-/// via mmap() or via malloc (see the two private AllocateVia* helpers). All buffers are
-/// allocated through the BufferPool's SystemAllocator. The allocator only handles
-/// allocating buffers that are power-of-two multiples of the minimum buffer length.
-class SystemAllocator {
-public:
-    SystemAllocator(int64_t min_buffer_len);
-
-    /// Allocate memory for a buffer of 'len' bytes. 'len' must be a power-of-two multiple
-    /// of the minimum buffer length.
-    Status Allocate(int64_t len, BufferPool::BufferHandle* buffer) WARN_UNUSED_RESULT;
-
-    /// Free the memory for a previously-allocated buffer.
-    void Free(BufferPool::BufferHandle&& buffer);
-
-private:
-    /// Allocate 'len' bytes of memory for a buffer via mmap().
-    Status AllocateViaMMap(int64_t len, uint8_t** buffer_mem);
-
-    /// Allocate 'len' bytes of memory for a buffer via our malloc implementation.
-    Status AllocateViaMalloc(int64_t len, uint8_t** buffer_mem);
-
-    const int64_t min_buffer_len_; // Minimum buffer length passed to the constructor.
-};
-} // namespace doris
diff --git a/be/src/runtime/disk_io_mgr.cc b/be/src/runtime/disk_io_mgr.cc
deleted file mode 100644
index 702ee127a2..0000000000
--- a/be/src/runtime/disk_io_mgr.cc
+++ /dev/null
@@ -1,1195 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-// This file is copied from
-// https://github.com/apache/impala/blob/branch-2.9.0/be/src/runtime/disk-io-mgr.cc
-// and modified by Doris
-
-#include "runtime/disk_io_mgr.h"
-
-#include <boost/algorithm/string.hpp>
-
-#include "runtime/disk_io_mgr_internal.h"
-#include "runtime/exec_env.h"
-#include "runtime/thread_context.h"
-
-using std::string;
-using std::stringstream;
-using std::vector;
-using std::list;
-using std::endl;
-
-using std::lock_guard;
-using std::unique_lock;
-using std::mutex;
-using std::thread;
-
-// Returns ceil(value / divisor); exact only for value >= 0 and divisor > 0.
-static int64_t bit_ceil(int64_t value, int64_t divisor) {
-    return value / divisor + (value % divisor != 0);
-}
-
-// Returns ceil(log2(x)).
-// TODO: this could be faster if we use __builtin_clzll (x is 64-bit).  Fix this if this
-// ever shows up in a hot path.
-static int bit_log2(uint64_t x) {
-    DCHECK_GT(x, 0); // log2 is undefined for 0.
-    if (x == 1) {
-        return 0;
-    }
-    // Compute result = ceil(log2(x))
-    //                = floor(log2(x - 1)) + 1, for x > 1
-    // by finding the position of the most significant bit (1-indexed) of x - 1
-    // (floor(log2(n)) = MSB(n) (0-indexed))
-    --x;
-    int result = 1;
-    while (x >>= 1) {
-        ++result;
-    }
-    return result;
-}
-
-namespace doris {
-
-// Defaults used by init() when no thread count is configured: rotational disks get 1 thread
-// to minimize seeks; non-rotational disks benefit from multiple concurrent IO requests.
-static const int THREADS_PER_ROTATIONAL_DISK = 1;
-static const int THREADS_PER_FLASH_DISK = 8;
-
-// The IoMgr is able to run with a wide range of memory usage. If a query has memory
-// remaining less than this value, the IoMgr will stop all buffering regardless of the
-// current queue size.
-static const int LOW_MEMORY = 64 * 1024 * 1024; // 64 MiB
-
-const int DiskIoMgr::DEFAULT_QUEUE_CAPACITY = 2; // Out-of-class definition of the static member.
-
-// namespace detail {
-// Indicates if file handle caching should be used
-// static inline bool is_file_handle_caching_enabled() {
-//     return config::max_cached_file_handles > 0;
-// }
-// }
-
-// This method is used to clean up resources upon eviction of a cache file handle.
-// void DiskIoMgr::HdfsCachedFileHandle::release(DiskIoMgr::HdfsCachedFileHandle** h) {
-//   VLOG_FILE << "Cached file handle evicted, hdfsCloseFile() fid=" << (*h)->_hdfs_file;
-//   delete (*h);
-// }
-
-// DiskIoMgr::HdfsCachedFileHandle::HdfsCachedFileHandle(const hdfsFS& fs, const char* fname,
-//     int64_t mtime)
-//     : _fs(fs), _hdfs_file(hdfsOpenFile(fs, fname, O_RDONLY, 0, 0, 0)), _mtime(mtime) {
-//   VLOG_FILE << "hdfsOpenFile() file=" << fname << " fid=" << _hdfs_file;
-// }
-
-// DiskIoMgr::HdfsCachedFileHandle::~HdfsCachedFileHandle() {
-//   if (_hdfs_file != nullptr && _fs != nullptr) {
-//     VLOG_FILE << "hdfsCloseFile() fid=" << _hdfs_file;
-//     hdfsCloseFile(_fs, _hdfs_file);
-//   }
-//   _fs = nullptr;
-//   _hdfs_file = nullptr;
-// }
-
-// This class provides a cache of RequestContext objects.  RequestContexts are recycled.
-// This is good for locality as well as lock contention.  The cache has the property that
-// regardless of how many clients get added/removed, the memory locations for
-// existing clients do not change (not the case with std::vector) minimizing the locks we
-// have to take across all readers.
-// The member functions take _lock before touching the two lists (the destructor does not).
-class DiskIoMgr::RequestContextCache {
-public:
-    RequestContextCache(DiskIoMgr* io_mgr) : _io_mgr(io_mgr) {}
-
-    // Returns a context to the cache.  The state is set to Inactive before _lock is taken.
-    void return_context(RequestContext* reader) {
-        DCHECK(reader->_state != RequestContext::Inactive);
-        reader->_state = RequestContext::Inactive;
-        lock_guard<mutex> l(_lock);
-        _inactive_contexts.push_back(reader);
-    }
-
-    // Returns a RequestContext, reusing an inactive one if any, else allocating a new one.
-    RequestContext* get_new_context() {
-        lock_guard<mutex> l(_lock);
-        if (!_inactive_contexts.empty()) {
-            RequestContext* reader = _inactive_contexts.front();
-            _inactive_contexts.pop_front();
-            return reader;
-        } else {
-            RequestContext* reader = new RequestContext(_io_mgr, _io_mgr->num_total_disks());
-            _all_contexts.push_back(reader);
-            return reader;
-        }
-    }
-
-    // This object has the same lifetime as the disk IoMgr.  Deletes every context created.
-    ~RequestContextCache() {
-        for (list<RequestContext*>::iterator it = _all_contexts.begin(); it != _all_contexts.end();
-             ++it) {
-            delete *it;
-        }
-    }
-
-    // Validates that all readers are cleaned up and in the inactive state.
-    bool validate_all_inactive() {
-        lock_guard<mutex> l(_lock);
-        for (list<RequestContext*>::iterator it = _all_contexts.begin(); it != _all_contexts.end();
-             ++it) {
-            if ((*it)->_state != RequestContext::Inactive) {
-                return false;
-            }
-        }
-        DCHECK_EQ(_all_contexts.size(), _inactive_contexts.size());
-        return _all_contexts.size() == _inactive_contexts.size();
-    }
-
-    string debug_string();
-
-private:
-    DiskIoMgr* _io_mgr;
-
-    // lock to protect all members below
-    mutex _lock;
-
-    // Owns every request context created (deleted in the destructor).  Also used for debugging.
-    list<RequestContext*> _all_contexts;
-
-    // List of inactive readers.  These objects can be used for a new reader.
-    list<RequestContext*> _inactive_contexts;
-};
-
-string DiskIoMgr::RequestContextCache::debug_string() {
-    lock_guard<mutex> l(_lock); // Cache lock stays held while each context lock is taken below.
-    stringstream ss;
-    for (list<RequestContext*>::iterator it = _all_contexts.begin(); it != _all_contexts.end();
-         ++it) {
-        unique_lock<mutex> lock((*it)->_lock); // Held for this context's debug_string() call.
-        ss << (*it)->debug_string() << endl;
-    }
-    return ss.str();
-}
-
-string DiskIoMgr::debug_string() {
-    stringstream ss;
-    ss << "RequestContexts: " << endl << _request_context_cache->debug_string() << endl;
-
-    ss << "Disks: " << endl;
-    for (int i = 0; i < _disk_queues.size(); ++i) {
-        unique_lock<mutex> lock(_disk_queues[i]->lock); // Entry is dereferenced without a null check.
-        ss << "  " << (void*)_disk_queues[i] << ":";
-        if (!_disk_queues[i]->request_contexts.empty()) {
-            ss << " Readers: ";
-            for (RequestContext* req_context : _disk_queues[i]->request_contexts) {
-                ss << (void*)req_context; // NOTE(review): no separator is written between pointers.
-            }
-        }
-        ss << endl;
-    }
-    return ss.str();
-}
-
-DiskIoMgr::BufferDescriptor::BufferDescriptor(DiskIoMgr* io_mgr)
-        : _io_mgr(io_mgr), _reader(nullptr), _buffer(nullptr) {} // Remaining fields are assigned by reset().
-
-void DiskIoMgr::BufferDescriptor::reset(RequestContext* reader, ScanRange* range, char* buffer,
-                                        int64_t buffer_len) {
-    DCHECK(_io_mgr != nullptr);
-    DCHECK(_buffer == nullptr); // The descriptor must not already hold a buffer.
-    DCHECK(range != nullptr);
-    DCHECK(buffer != nullptr);
-    DCHECK_GE(buffer_len, 0);
-    _reader = reader; // 'reader' is the only argument without a DCHECK.
-    _scan_range = range;
-    _buffer = buffer;
-    _buffer_len = buffer_len;
-    _len = 0; // presumably the number of valid bytes in _buffer -- TODO confirm
-    _eosr = false;
-    _status = Status::OK();
-}
-
-void DiskIoMgr::BufferDescriptor::return_buffer() {
-    DCHECK(_io_mgr != nullptr);
-    _io_mgr->return_buffer(this); // Delegates to DiskIoMgr::return_buffer(BufferDescriptor*).
-}
-
-DiskIoMgr::WriteRange::WriteRange(const string& file, int64_t file_offset, int disk_id,
-                                  WriteDoneCallback callback) {
-    _file = file;
-    _offset = file_offset;
-    _disk_id = disk_id;
-    _callback = callback;
-    _request_type = RequestType::WRITE; // _data and _len are not set here; see set_data().
-}
-
-void DiskIoMgr::WriteRange::set_data(const uint8_t* buffer, int64_t len) {
-    _data = buffer; // Only the pointer is stored; the bytes are not copied.
-    _len = len;
-}
-
-static void check_sse_support() {
-    if (!CpuInfo::is_supported(CpuInfo::SSE4_2)) { // Only logs a warning; nothing is reconfigured.
-        LOG(WARNING) << "This machine does not support sse4_2.  The default IO system "
-                        "configurations are suboptimal for this hardware.  Consider "
-                        "increasing the number of threads per disk by restarting doris "
-                        "using the --num_threads_per_disk flag with a higher value";
-    }
-}
-
-DiskIoMgr::DiskIoMgr()
-        : _num_threads_per_disk(config::num_threads_per_disk),
-          _max_buffer_size(config::read_size),
-          _min_buffer_size(config::min_buffer_size),
-          _cached_read_options(nullptr),
-          _shut_down(false),
-          _total_bytes_read_counter(TUnit::BYTES),
-          _read_timer(TUnit::TIME_NS)
-// _read_timer(TUnit::TIME_NS),
-// _file_handle_cache(
-//         std::min((uint64_t)config::max_cached_file_handles, FileSystemUtil::max_num_file_handles()),
-//         &HdfsCachedFileHandle::release) {
-{
-    int64_t max_buffer_size_scaled = bit_ceil(_max_buffer_size, _min_buffer_size);
-    _free_buffers.resize(bit_log2(max_buffer_size_scaled) + 1); // presumably one list per power-of-two size class -- TODO confirm
-    int num_local_disks = (config::num_disks == 0 ? DiskInfo::num_disks() : config::num_disks);
-    _disk_queues.resize(num_local_disks + REMOTE_NUM_DISKS); // Entries are filled in by init().
-    check_sse_support();
-}
-
-DiskIoMgr::DiskIoMgr(int num_local_disks, int threads_per_disk, int min_buffer_size,
-                     int max_buffer_size)
-        : _num_threads_per_disk(threads_per_disk),
-          _max_buffer_size(max_buffer_size),
-          _min_buffer_size(min_buffer_size),
-          _cached_read_options(nullptr),
-          _shut_down(false),
-          _total_bytes_read_counter(TUnit::BYTES),
-          _read_timer(TUnit::TIME_NS)
-// _read_timer(TUnit::TIME_NS),
-// _file_handle_cache(::min(config::max_cached_file_handles,
-//             FileSystemUtil::max_num_file_handles()), &HdfsCachedFileHandle::release) {
-{
-    int64_t max_buffer_size_scaled = bit_ceil(_max_buffer_size, _min_buffer_size);
-    _free_buffers.resize(bit_log2(max_buffer_size_scaled) + 1); // presumably one list per power-of-two size class -- TODO confirm
-    if (num_local_disks == 0) { // 0 means "use DiskInfo::num_disks()".
-        num_local_disks = DiskInfo::num_disks();
-    }
-    _disk_queues.resize(num_local_disks + REMOTE_NUM_DISKS); // Entries are filled in by init().
-    check_sse_support();
-}
-
-DiskIoMgr::~DiskIoMgr() {
-    _shut_down = true;
-    // Notify all worker threads and shut them down.
-    for (int i = 0; i < _disk_queues.size(); ++i) {
-        if (_disk_queues[i] == nullptr) {
-            continue;
-        }
-        {
-            // This lock is necessary to properly use the condition var to notify
-            // the disk worker threads.  The readers also grab this lock so updates
-            // to _shut_down are protected.  NOTE(review): the store above precedes the lock.
-            unique_lock<mutex> disk_lock(_disk_queues[i]->lock);
-        }
-        _disk_queues[i]->work_available.notify_all();
-    }
-    _disk_thread_group.join_all();
-
-    for (int i = 0; i < _disk_queues.size(); ++i) {
-        if (_disk_queues[i] == nullptr) {
-            continue;
-        }
-        int disk_id = _disk_queues[i]->disk_id;
-        for (list<RequestContext*>::iterator it = _disk_queues[i]->request_contexts.begin();
-             it != _disk_queues[i]->request_contexts.end(); ++it) {
-            DCHECK_EQ((*it)->_disk_states[disk_id].num_threads_in_op(), 0);
-            DCHECK((*it)->_disk_states[disk_id].done());
-            (*it)->decrement_disk_ref_count();
-        }
-    }
-
-    DCHECK(_request_context_cache.get() == nullptr ||
-           _request_context_cache->validate_all_inactive())
-            << endl
-            << debug_string();
-    DCHECK_EQ(_num_buffers_in_readers, 0);
-
-    // Check that every allocated buffer is back on a free list, then delete them all.
-    int num_free_buffers = 0;
-    for (int idx = 0; idx < _free_buffers.size(); ++idx) {
-        num_free_buffers += _free_buffers[idx].size();
-    }
-    DCHECK_EQ(_num_allocated_buffers, num_free_buffers);
-    gc_io_buffers();
-
-    for (int i = 0; i < _disk_queues.size(); ++i) {
-        delete _disk_queues[i]; // No null check needed: delete on a null pointer is a no-op.
-    }
-
-    /*
-     * if (_cached_read_options != nullptr) {
-     *     hadoopRzOptionsFree(_cached_read_options);
-     * }
-     */
-}
-
-Status DiskIoMgr::init(const int64_t mem_limit) {
-    _mem_tracker = std::make_unique<MemTrackerLimiter>(MemTrackerLimiter::Type::GLOBAL, "DiskIO",
-                                                       mem_limit);
-
-    for (int i = 0; i < _disk_queues.size(); ++i) {
-        _disk_queues[i] = new DiskQueue(i);
-        int num_threads_per_disk = 0;
-        if (i >= num_local_disks()) {
-            // Remote disks get a DiskQueue but no worker threads are started for them here.
-            continue;
-        } else if (_num_threads_per_disk != 0) {
-            num_threads_per_disk = _num_threads_per_disk; // An explicit setting wins.
-        } else if (DiskInfo::is_rotational(i)) {
-            num_threads_per_disk = THREADS_PER_ROTATIONAL_DISK;
-        } else {
-            num_threads_per_disk = THREADS_PER_FLASH_DISK;
-        }
-        for (int j = 0; j < num_threads_per_disk; ++j) {
-            stringstream ss; // Unused: its only consumer is the commented-out call below.
-            ss << "work-loop(Disk: " << i << ", Thread: " << j << ")";
-            // _disk_thread_group.AddThread(new Thread("disk-io-mgr", ss.str(),
-            //             &DiskIoMgr::work_loop, this, _disk_queues[i]));
-            _disk_thread_group.add_thread(
-                    new std::thread(std::bind(&DiskIoMgr::work_loop, this, _disk_queues[i])));
-        }
-    }
-    _request_context_cache.reset(new RequestContextCache(this));
-
-    // _cached_read_options = hadoopRzOptionsAlloc();
-    // DCHECK(_cached_read_options != nullptr);
-    // Disable checksum for cached reads.
-    // int ret = hadoopRzOptionsSetSkipChecksum(_cached_read_options, true);
-    // DCHECK_EQ(ret, 0);
-    // Disable automatic fallback for cached reads.
-    // ret = hadoopRzOptionsSetByteBufferPool(_cached_read_options, nullptr);
-    // DCHECK_EQ(ret, 0);
-
-    return Status::OK();
-}
-
-Status DiskIoMgr::register_context(RequestContext** request_context) {
-    DCHECK(_request_context_cache) << "Must call init() first.";
-    *request_context = _request_context_cache->get_new_context();
-    (*request_context)->reset(); // Called on both newly allocated and recycled contexts.
-    return Status::OK();
-}
-
-void DiskIoMgr::unregister_context(RequestContext* reader) {
-    // Blocking cancel (waiting for disks completion).
-    cancel_context(reader, true);
-
-    // All disks are done with this context; validate that nothing is leaking.
-    unique_lock<mutex> reader_lock(reader->_lock);
-    DCHECK_EQ(reader->_num_buffers_in_reader, 0) << endl << reader->debug_string();
-    DCHECK_EQ(reader->_num_used_buffers, 0) << endl << reader->debug_string();
-
-    DCHECK(reader->validate()) << endl << reader->debug_string();
-    _request_context_cache->return_context(reader); // Called with reader->_lock still held.
-}
-
-// Cancellation requires coordination from multiple threads.  Each thread that currently
-// has a reference to the request context must notice the cancel and remove it from its
-// tracking structures.  The last thread to touch the context should deallocate (aka
-// recycle) the request context object.  Potential threads are:
-//  1. Disk threads that are currently reading for this reader.
-//  2. Caller threads that are waiting in get_next.
-//
-// The steps are:
-// 1. Cancel will immediately set the context in the Cancelled state.  This prevents any
-// other thread from adding more ready buffers to the context (they all take a lock and
-// check the state before doing so), or any write ranges to the context.
-// 2. Cancel will call cancel on each ScanRange that is not yet complete, unblocking
-// any threads in get_next(). The reader will see the cancelled Status returned. Cancel
-// also invokes the callback for the WriteRanges with the cancelled state.
-// 3. Disk threads notice the context is cancelled either when picking the next context
-// to process or when they try to enqueue a ready buffer.  Upon noticing the cancelled
-// state, they remove the context from the disk queue.  The last thread per disk with an
-// outstanding reference to the context decrements the number of disk queues the context
-// is on.
-// If wait_for_disks_completion is true, wait for the number of active disks to become 0.
-void DiskIoMgr::cancel_context(RequestContext* context, bool wait_for_disks_completion) {
-    context->cancel(Status::Cancelled("Cancelled"));
-
-    if (wait_for_disks_completion) {
-        unique_lock<mutex> lock(context->_lock);
-        DCHECK(context->validate()) << endl << context->debug_string();
-        while (context->_num_disks_with_ranges > 0) { // Loop re-checks after every wakeup.
-            context->_disks_complete_cond_var.wait(lock);
-        }
-    }
-}
-
-void DiskIoMgr::set_read_timer(RequestContext* r, RuntimeProfile::Counter* c) {
-    r->_read_timer = c; // Pointer store only; r->_lock is not taken here.
-}
-
-void DiskIoMgr::set_bytes_read_counter(RequestContext* r, RuntimeProfile::Counter* c) {
-    r->_bytes_read_counter = c; // Pointer store only; r->_lock is not taken here.
-}
-
-void DiskIoMgr::set_active_read_thread_counter(RequestContext* r, RuntimeProfile::Counter* c) {
-    r->_active_read_thread_counter = c; // Pointer store only; r->_lock is not taken here.
-}
-
-void DiskIoMgr::set_disks_access_bitmap(RequestContext* r, RuntimeProfile::Counter* c) {
-    r->_disks_accessed_bitmap = c; // Note the field is "_disks_accessed_", the setter "_access_".
-}
-
-int64_t DiskIoMgr::queue_size(RequestContext* reader) const {
-    return reader->_num_ready_buffers; // Read without taking reader->_lock.
-}
-
-Status DiskIoMgr::context_status(RequestContext* context) const {
-    unique_lock<mutex> lock(context->_lock); // Held while _status is copied for the return.
-    return context->_status;
-}
-
-int DiskIoMgr::num_unstarted_ranges(RequestContext* reader) const {
-    return reader->_num_unstarted_scan_ranges; // Read without taking reader->_lock.
-}
-
-int64_t DiskIoMgr::bytes_read_local(RequestContext* reader) const {
-    return reader->_bytes_read_local; // Read without taking reader->_lock.
-}
-
-int64_t DiskIoMgr::bytes_read_short_circuit(RequestContext* reader) const {
-    return reader->_bytes_read_short_circuit; // Read without taking reader->_lock.
-}
-
-int64_t DiskIoMgr::bytes_read_dn_cache(RequestContext* reader) const {
-    return reader->_bytes_read_dn_cache; // Read without taking reader->_lock.
-}
-
-int DiskIoMgr::num_remote_ranges(RequestContext* reader) const {
-    return reader->_num_remote_ranges; // Read without taking reader->_lock.
-}
-
-int64_t DiskIoMgr::unexpected_remote_bytes(RequestContext* reader) const {
-    return reader->_unexpected_remote_bytes; // Read without taking reader->_lock.
-}
-
-int64_t DiskIoMgr::get_read_throughput() {
-    return RuntimeProfile::units_per_second(&_total_bytes_read_counter, &_read_timer); // Both counters belong to the manager, not to one context.
-}
-
-Status DiskIoMgr::validate_scan_range(ScanRange* range) {
-    int disk_id = range->_disk_id;
-    if (disk_id < 0 || disk_id >= _disk_queues.size()) { // Valid ids index into _disk_queues.
-        stringstream ss;
-        ss << "Invalid scan range.  Bad disk id: " << disk_id;
-        DCHECK(false) << ss.str(); // Fails fast where DCHECK is active; otherwise the error is returned.
-        return Status::InternalError(ss.str());
-    }
-    return Status::OK();
-}
-
-Status DiskIoMgr::add_scan_ranges(RequestContext* reader, const vector<ScanRange*>& ranges,
-                                  bool schedule_immediately) {
-    if (ranges.empty()) {
-        return Status::OK();
-    }
-
-    // Validate and initialize all ranges (done before the reader lock is taken).
-    for (int i = 0; i < ranges.size(); ++i) {
-        RETURN_IF_ERROR(validate_scan_range(ranges[i]));
-        ranges[i]->init_internal(this, reader);
-    }
-
-    // Hold the reader lock while checking for cancellation and queuing the ranges.
-    unique_lock<mutex> reader_lock(reader->_lock);
-    DCHECK(reader->validate()) << endl << reader->debug_string();
-
-    if (reader->_state == RequestContext::Cancelled) {
-        DCHECK(!reader->_status.ok());
-        return reader->_status;
-    }
-
-    // Add each range to the queue of the disk the range is on
-    for (int i = 0; i < ranges.size(); ++i) {
-        // Empty ranges are not allowed; this is enforced by DCHECK only.
-        DCHECK_NE(ranges[i]->len(), 0);
-        ScanRange* range = ranges[i];
-
-        /*
-         * if (range->_try_cache) {
-         *     if (schedule_immediately) {
-         *         bool cached_read_succeeded;
-         *         RETURN_IF_ERROR(range->read_from_cache(&cached_read_succeeded));
-         *         if (cached_read_succeeded) continue;
-         *         // Cached read failed, fall back to add_request_range() below.
-         *     } else {
-         *         reader->_cached_ranges.enqueue(range);
-         *         continue;
-         *     }
-         * }
-         */
-        reader->add_request_range(range, schedule_immediately);
-    }
-    DCHECK(reader->validate()) << endl << reader->debug_string();
-
-    return Status::OK();
-}
-
-// This function returns the next scan range the reader should work on, checking
-// for eos and error cases. If no scan range has been prepared by the disk threads yet,
-// the caller waits on them. '*range' stays nullptr when all ranges are done or on cancel.
-Status DiskIoMgr::get_next_range(RequestContext* reader, ScanRange** range) {
-    DCHECK(reader != nullptr);
-    DCHECK(range != nullptr);
-    *range = nullptr;
-    Status status = Status::OK();
-
-    unique_lock<mutex> reader_lock(reader->_lock);
-    DCHECK(reader->validate()) << endl << reader->debug_string();
-
-    while (true) {
-        if (reader->_state == RequestContext::Cancelled) {
-            DCHECK(!reader->_status.ok());
-            status = reader->_status;
-            break;
-        }
-
-        if (reader->_num_unstarted_scan_ranges == 0 && reader->_ready_to_start_ranges.empty() &&
-            reader->_cached_ranges.empty()) { // NOTE(review): cached-range handling below is commented out.
-            // All ranges are done, just return.
-            break;
-        }
-
-        // if (!reader->_cached_ranges.empty()) {
-        //     // We have a cached range.
-        //     *range = reader->_cached_ranges.dequeue();
-        //     DCHECK((*range)->_try_cache);
-        //     // bool cached_read_succeeded;
-        //     // RETURN_IF_ERROR((*range)->read_from_cache(&cached_read_succeeded));
-        //     // if (cached_read_succeeded) return Status::OK();
-
-        //     // This range ended up not being cached. Loop again and pick up a new range.
-        //     reader->add_request_range(*range, false);
-        //     DCHECK(reader->validate()) << endl << reader->debug_string();
-        //     *range = nullptr;
-        //     continue;
-        // }
-
-        if (reader->_ready_to_start_ranges.empty()) {
-            reader->_ready_to_start_ranges_cv.wait(reader_lock); // Loop re-checks state after wakeup.
-        } else {
-            *range = reader->_ready_to_start_ranges.dequeue();
-            DCHECK(*range != nullptr);
-            int disk_id = (*range)->disk_id();
-            DCHECK_EQ(*range, reader->_disk_states[disk_id].next_scan_range_to_start());
-            // Set this to nullptr, the next time this disk runs for this reader, it will
-            // get another range ready.
-            reader->_disk_states[disk_id].set_next_scan_range_to_start(nullptr);
-            reader->schedule_scan_range(*range);
-            break;
-        }
-    }
-    return status;
-}
-
-Status DiskIoMgr::read(RequestContext* reader, ScanRange* range, BufferDescriptor** buffer) {
-    DCHECK(range != nullptr);
-    DCHECK(buffer != nullptr);
-    *buffer = nullptr;
-
-    if (range->len() > _max_buffer_size) {
-        return Status::InternalError("Cannot perform sync read larger than {}. Request was {}",
-                                     _max_buffer_size, range->len());
-    }
-
-    vector<DiskIoMgr::ScanRange*> ranges;
-    ranges.push_back(range);
-    RETURN_IF_ERROR(add_scan_ranges(reader, ranges, true));
-    RETURN_IF_ERROR(range->get_next(buffer));
-    DCHECK((*buffer) != nullptr);
-    DCHECK((*buffer)->eosr());
-    return Status::OK();
-}
-
-void DiskIoMgr::return_buffer(BufferDescriptor* buffer_desc) {
-    DCHECK(buffer_desc != nullptr);
-    if (!buffer_desc->_status.ok()) {
-        DCHECK(buffer_desc->_buffer == nullptr);
-    }
-
-    RequestContext* reader = buffer_desc->_reader;
-    if (buffer_desc->_buffer != nullptr) {
-        if (buffer_desc->_scan_range->_cached_buffer == nullptr) {
-            // Not a cached buffer. Return the io buffer and update mem tracking.
-            return_free_buffer(buffer_desc);
-        }
-        buffer_desc->_buffer = nullptr;
-        --_num_buffers_in_readers;
-        --reader->_num_buffers_in_reader;
-    } else {
-        // A nullptr buffer means there was an error in which case there is no buffer
-        // to return.
-    }
-
-    if (buffer_desc->_eosr || buffer_desc->_scan_range->_is_cancelled) {
-        // Need to close the scan range if returning the last buffer or the scan range
-        // has been cancelled (and the caller might never get the last buffer).
-        // close() is idempotent so multiple cancelled buffers is okay.
-        buffer_desc->_scan_range->close();
-    }
-    return_buffer_desc(buffer_desc);
-}
-
-void DiskIoMgr::return_buffer_desc(BufferDescriptor* desc) {
-    DCHECK(desc != nullptr);
-    unique_lock<mutex> lock(_free_buffers_lock);
-    DCHECK(find(_free_buffer_descs.begin(), _free_buffer_descs.end(), desc) ==
-           _free_buffer_descs.end());
-    _free_buffer_descs.push_back(desc);
-}
-
-DiskIoMgr::BufferDescriptor* DiskIoMgr::get_buffer_desc(RequestContext* reader, ScanRange* range,
-                                                        char* buffer, int64_t buffer_size) {
-    BufferDescriptor* buffer_desc = nullptr;
-    {
-        unique_lock<mutex> lock(_free_buffers_lock);
-        if (_free_buffer_descs.empty()) {
-            buffer_desc = _pool.add(new BufferDescriptor(this));
-        } else {
-            buffer_desc = _free_buffer_descs.front();
-            _free_buffer_descs.pop_front();
-        }
-    }
-    buffer_desc->reset(reader, range, buffer, buffer_size);
-    return buffer_desc;
-}
-
-char* DiskIoMgr::get_free_buffer(int64_t* buffer_size) {
-    DCHECK_LE(*buffer_size, _max_buffer_size);
-    DCHECK_GT(*buffer_size, 0);
-    *buffer_size = std::min(static_cast<int64_t>(_max_buffer_size), *buffer_size);
-    int idx = free_buffers_idx(*buffer_size);
-    // Quantize buffer size to nearest power of 2 greater than the specified buffer size and
-    // convert to bytes
-    *buffer_size = (1 << idx) * _min_buffer_size;
-
-    unique_lock<mutex> lock(_free_buffers_lock);
-    char* buffer = nullptr;
-    if (_free_buffers[idx].empty()) {
-        ++_num_allocated_buffers;
-        buffer = new char[*buffer_size];
-    } else {
-        // This means the buffer's memory ownership is transferred from DiskIoMgr to tls tracker.
-        THREAD_MEM_TRACKER_TRANSFER_FROM(*buffer_size, _mem_tracker.get());
-        buffer = _free_buffers[idx].front();
-        _free_buffers[idx].pop_front();
-    }
-    DCHECK(buffer != nullptr);
-    return buffer;
-}
-
-void DiskIoMgr::gc_io_buffers(int64_t bytes_to_free) {
-    unique_lock<mutex> lock(_free_buffers_lock);
-    int bytes_freed = 0;
-    for (int idx = 0; idx < _free_buffers.size(); ++idx) {
-        for (list<char*>::iterator iter = _free_buffers[idx].begin();
-             iter != _free_buffers[idx].end(); ++iter) {
-            int64_t buffer_size = (1 << idx) * _min_buffer_size;
-            --_num_allocated_buffers;
-            delete[] * iter;
-
-            bytes_freed += buffer_size;
-        }
-        _free_buffers[idx].clear();
-        if (bytes_freed >= bytes_to_free) {
-            break;
-        }
-    }
-    // The deleted buffer is released in the tls mem tracker, the deleted buffer belongs to DiskIoMgr,
-    // so the freed memory should be recorded in the DiskIoMgr mem tracker. So if the tls mem tracker
-    // and the DiskIoMgr tracker are different, transfer memory ownership.
-    THREAD_MEM_TRACKER_TRANSFER_FROM(bytes_freed, _mem_tracker.get());
-}
-
-void DiskIoMgr::return_free_buffer(BufferDescriptor* desc) {
-    return_free_buffer(desc->_buffer, desc->_buffer_len);
-}
-
-void DiskIoMgr::return_free_buffer(char* buffer, int64_t buffer_size) {
-    DCHECK(buffer != nullptr);
-    int idx = free_buffers_idx(buffer_size);
-    DCHECK_EQ(bit_ceil(buffer_size, _min_buffer_size) & ~(1 << idx), 0)
-            << "_buffer_size / _min_buffer_size should be power of 2, got buffer_size = "
-            << buffer_size << ", _min_buffer_size = " << _min_buffer_size;
-    unique_lock<mutex> lock(_free_buffers_lock);
-    if (!config::disable_mem_pools && _free_buffers[idx].size() < config::max_free_io_buffers) {
-        // The buffer's memory ownership is transferred from desc->buffer_mem_tracker to DiskIoMgr tracker.
-        THREAD_MEM_TRACKER_TRANSFER_TO(buffer_size, _mem_tracker.get());
-        _free_buffers[idx].push_back(buffer);
-    } else {
-        --_num_allocated_buffers;
-        delete[] buffer;
-    }
-}
-
-// This function gets the next RequestRange to work on for this disk. It checks for
-// cancellation and
-// a) Updates ready_to_start_ranges if there are no scan ranges queued for this disk.
-// b) Adds an unstarted write range to _in_flight_ranges. The write range is processed
-//    immediately if there are no preceding scan ranges in _in_flight_ranges
-// It blocks until work is available or the thread is shut down.
-// Work is available if there is a RequestContext with
-//  - A ScanRange with a buffer available, or
-//  - A WriteRange in _unstarted_write_ranges.
-bool DiskIoMgr::get_next_request_range(DiskQueue* disk_queue, RequestRange** range,
-                                       RequestContext** request_context) {
-    int disk_id = disk_queue->disk_id;
-    *range = nullptr;
-
-    // This loops returns either with work to do or when the disk IoMgr shuts down.
-    while (!_shut_down) {
-        *request_context = nullptr;
-        RequestContext::PerDiskState* request_disk_state = nullptr;
-        {
-            unique_lock<mutex> disk_lock(disk_queue->lock);
-
-            while (!_shut_down && disk_queue->request_contexts.empty()) {
-                // wait if there are no readers on the queue
-                disk_queue->work_available.wait(disk_lock);
-            }
-            if (_shut_down) {
-                break;
-            }
-            DCHECK(!disk_queue->request_contexts.empty());
-
-            // Get the next reader and remove the reader so that another disk thread
-            // can't pick it up.  It will be enqueued before issuing the read to HDFS
-            // so this is not a big deal (i.e. multiple disk threads can read for the
-            // same reader).
-            // TODO: revisit.
-            *request_context = disk_queue->request_contexts.front();
-            disk_queue->request_contexts.pop_front();
-            DCHECK(*request_context != nullptr);
-            request_disk_state = &((*request_context)->_disk_states[disk_id]);
-            request_disk_state->increment_request_thread_and_dequeue();
-        }
-
-        // NOTE: no locks were taken in between.  We need to be careful about what state
-        // could have changed to the reader and disk in between.
-        // There are some invariants here.  Only one disk thread can have the
-        // same reader here (the reader is removed from the queue).  There can be
-        // other disk threads operating on this reader in other functions though.
-
-        unique_lock<mutex> request_lock((*request_context)->_lock);
-        VLOG_FILE << "Disk (id=" << disk_id << ") reading for "
-                  << (*request_context)->debug_string();
-
-        // Check if reader has been cancelled
-        if ((*request_context)->_state == RequestContext::Cancelled) {
-            request_disk_state->decrement_request_thread_and_check_done(*request_context);
-            continue;
-        }
-
-        DCHECK_EQ((*request_context)->_state, RequestContext::Active)
-                << (*request_context)->debug_string();
-
-        if (request_disk_state->next_scan_range_to_start() == nullptr &&
-            !request_disk_state->unstarted_scan_ranges()->empty()) {
-            // We don't have a range queued for this disk for what the caller should
-            // read next. Populate that.  We want to have one range waiting to minimize
-            // wait time in get_next_range.
-            ScanRange* new_range = request_disk_state->unstarted_scan_ranges()->dequeue();
-            --(*request_context)->_num_unstarted_scan_ranges;
-            (*request_context)->_ready_to_start_ranges.enqueue(new_range);
-            request_disk_state->set_next_scan_range_to_start(new_range);
-
-            if ((*request_context)->_num_unstarted_scan_ranges == 0) {
-                // All the ranges have been started, notify everyone blocked on get_next_range.
-                // Only one of them will get work so make sure to return nullptr to the other
-                // caller threads.
-                (*request_context)->_ready_to_start_ranges_cv.notify_all();
-            } else {
-                (*request_context)->_ready_to_start_ranges_cv.notify_one();
-            }
-        }
-
-        // Always enqueue a WriteRange to be processed into _in_flight_ranges.
-        // This is done so _in_flight_ranges does not exclusively contain ScanRanges.
-        // For now, enqueuing a WriteRange on each invocation of get_next_request_range()
-        // does not flood in_flight_ranges() with WriteRanges because the entire
-        // WriteRange is processed and removed from the queue after get_next_request_range()
-        // returns. (A DCHECK is used to ensure that writes do not exceed 8MB).
-        if (!request_disk_state->unstarted_write_ranges()->empty()) {
-            WriteRange* write_range = request_disk_state->unstarted_write_ranges()->dequeue();
-            request_disk_state->in_flight_ranges()->enqueue(write_range);
-        }
-
-        // Get the next scan range to work on from the reader. Only in_flight_ranges
-        // are eligible since the disk threads do not start new ranges on their own.
-
-        // There are no inflight ranges, nothing to do.
-        if (request_disk_state->in_flight_ranges()->empty()) {
-            request_disk_state->decrement_request_thread();
-            continue;
-        }
-        DCHECK_GT(request_disk_state->num_remaining_ranges(), 0);
-        *range = request_disk_state->in_flight_ranges()->dequeue();
-        DCHECK(*range != nullptr);
-
-        // Now that we've picked a request range, put the context back on the queue so
-        // another thread can pick up another request range for this context.
-        request_disk_state->schedule_context(*request_context, disk_id);
-        DCHECK((*request_context)->validate()) << endl << (*request_context)->debug_string();
-        return true;
-    }
-
-    DCHECK(_shut_down);
-    return false;
-}
-
-void DiskIoMgr::handle_write_finished(RequestContext* writer, WriteRange* write_range,
-                                      const Status& write_status) {
-    // Execute the callback before decrementing the thread count. Otherwise cancel_context()
-    // that waits for the disk ref count to be 0 will return, creating a race, e.g.
-    // between BufferedBlockMgr::WriteComplete() and BufferedBlockMgr::~BufferedBlockMgr().
-    // See IMPALA-1890.
-    // The status of the write does not affect the status of the writer context.
-    write_range->_callback(write_status);
-    {
-        unique_lock<mutex> writer_lock(writer->_lock);
-        DCHECK(writer->validate()) << endl << writer->debug_string();
-        RequestContext::PerDiskState& state = writer->_disk_states[write_range->_disk_id];
-        if (writer->_state == RequestContext::Cancelled) {
-            state.decrement_request_thread_and_check_done(writer);
-        } else {
-            state.decrement_request_thread();
-        }
-        --state.num_remaining_ranges();
-    }
-}
-
-void DiskIoMgr::handle_read_finished(DiskQueue* disk_queue, RequestContext* reader,
-                                     BufferDescriptor* buffer) {
-    unique_lock<mutex> reader_lock(reader->_lock);
-
-    RequestContext::PerDiskState& state = reader->_disk_states[disk_queue->disk_id];
-    DCHECK(reader->validate()) << endl << reader->debug_string();
-    DCHECK_GT(state.num_threads_in_op(), 0);
-    DCHECK(buffer->_buffer != nullptr);
-
-    if (reader->_state == RequestContext::Cancelled) {
-        state.decrement_request_thread_and_check_done(reader);
-        DCHECK(reader->validate()) << endl << reader->debug_string();
-        return_free_buffer(buffer);
-        buffer->_buffer = nullptr;
-        buffer->_scan_range->cancel(reader->_status);
-        // Enqueue the buffer to use the scan range's buffer cleanup path.
-        buffer->_scan_range->enqueue_buffer(buffer);
-        return;
-    }
-
-    DCHECK_EQ(reader->_state, RequestContext::Active);
-    DCHECK(buffer->_buffer != nullptr);
-
-    // Update the reader's scan ranges.  There are a three cases here:
-    //  1. Read error
-    //  2. End of scan range
-    //  3. Middle of scan range
-    if (!buffer->_status.ok()) {
-        // Error case
-        return_free_buffer(buffer);
-        buffer->_eosr = true;
-        --state.num_remaining_ranges();
-        buffer->_scan_range->cancel(buffer->_status);
-    } else if (buffer->_eosr) {
-        --state.num_remaining_ranges();
-    }
-
-    // After calling enqueue_buffer(), it is no longer valid to read from buffer.
-    // Store the state we need before calling enqueue_buffer().
-    bool eosr = buffer->_eosr;
-    ScanRange* scan_range = buffer->_scan_range;
-    bool queue_full = buffer->_scan_range->enqueue_buffer(buffer);
-    if (eosr) {
-        // For cached buffers, we can't close the range until the cached buffer is returned.
-        // close() is called from DiskIoMgr::return_buffer().
-        /*
-         * if (scan_range->_cached_buffer == nullptr) {
-         *     scan_range->close();
-         * }
-         */
-    } else {
-        if (queue_full) {
-            reader->_blocked_ranges.enqueue(scan_range);
-        } else {
-            reader->schedule_scan_range(scan_range);
-        }
-    }
-    state.decrement_request_thread();
-}
-
-void DiskIoMgr::work_loop(DiskQueue* disk_queue) {
-    // The thread waits until there is work or the entire system is being shut down.
-    // If there is work, performs the read or write requested and re-enqueues the
-    // requesting context.
-    // Locks are not taken when reading from or writing to disk.
-    // The main loop has three parts:
-    //   1. GetNextRequestContext(): get the next request context (read or write) to
-    //      process and dequeue it.
-    //   2. For the dequeued request, gets the next scan- or write-range to process and
-    //      re-enqueues the request.
-    //   3. Perform the read or write as specified.
-    // Cancellation checking needs to happen in both steps 1 and 3.
-
-    while (!_shut_down) {
-        RequestContext* worker_context = nullptr;
-        ;
-        RequestRange* range = nullptr;
-
-        if (!get_next_request_range(disk_queue, &range, &worker_context)) {
-            DCHECK(_shut_down);
-            break;
-        }
-
-        if (range->request_type() == RequestType::READ) {
-            read_range(disk_queue, worker_context, static_cast<ScanRange*>(range));
-        } else {
-            DCHECK(range->request_type() == RequestType::WRITE);
-            write(worker_context, static_cast<WriteRange*>(range));
-        }
-    }
-
-    DCHECK(_shut_down);
-}
-
-// This function reads the specified scan range associated with the
-// specified reader context and disk queue.
-void DiskIoMgr::read_range(DiskQueue* disk_queue, RequestContext* reader, ScanRange* range) {
-    char* buffer = nullptr;
-    int64_t bytes_remaining = range->_len - range->_bytes_read;
-    DCHECK_GT(bytes_remaining, 0);
-    int64_t buffer_size = std::min(bytes_remaining, static_cast<int64_t>(_max_buffer_size));
-    bool enough_memory = _mem_tracker->spare_capacity() > LOW_MEMORY;
-    if (!enough_memory) {
-        // Low memory, GC and try again.
-        gc_io_buffers();
-        enough_memory = _mem_tracker->spare_capacity() > LOW_MEMORY;
-    }
-
-    if (!enough_memory) {
-        RequestContext::PerDiskState& state = reader->_disk_states[disk_queue->disk_id];
-        unique_lock<mutex> reader_lock(reader->_lock);
-
-        // Just grabbed the reader lock, check for cancellation.
-        if (reader->_state == RequestContext::Cancelled) {
-            DCHECK(reader->validate()) << endl << reader->debug_string();
-            state.decrement_request_thread_and_check_done(reader);
-            range->cancel(reader->_status);
-            DCHECK(reader->validate()) << endl << reader->debug_string();
-            return;
-        }
-
-        if (!range->_ready_buffers.empty()) {
-            // We have memory pressure and this range doesn't need another buffer
-            // (it already has one queued). Skip this range and pick it up later.
-            range->_blocked_on_queue = true;
-            reader->_blocked_ranges.enqueue(range);
-            state.decrement_request_thread();
-            return;
-        } else {
-            // We need to get a buffer anyway since there are none queued. The query
-            // is likely to fail due to mem limits but there's nothing we can do about that
-            // now.
-        }
-    }
-
-    buffer = get_free_buffer(&buffer_size);
-    ++reader->_num_used_buffers;
-
-    // Validate more invariants.
-    DCHECK_GT(reader->_num_used_buffers, 0);
-    DCHECK(range != nullptr);
-    DCHECK(reader != nullptr);
-    DCHECK(buffer != nullptr);
-
-    BufferDescriptor* buffer_desc = get_buffer_desc(reader, range, buffer, buffer_size);
-    DCHECK(buffer_desc != nullptr);
-
-    // No locks in this section.  Only working on local vars.  We don't want to hold a
-    // lock across the read call.
-    buffer_desc->_status = range->open();
-    if (buffer_desc->_status.ok()) {
-        // Update counters.
-        if (reader->_active_read_thread_counter) {
-            reader->_active_read_thread_counter->update(1L);
-        }
-        if (reader->_disks_accessed_bitmap) {
-            int64_t disk_bit = 1 << disk_queue->disk_id;
-            reader->_disks_accessed_bitmap->bit_or(disk_bit);
-        }
-        SCOPED_TIMER(&_read_timer);
-        SCOPED_TIMER(reader->_read_timer);
-
-        buffer_desc->_status = range->read(buffer, &buffer_desc->_len, &buffer_desc->_eosr);
-        buffer_desc->_scan_range_offset = range->_bytes_read - buffer_desc->_len;
-
-        if (reader->_bytes_read_counter != nullptr) {
-            COUNTER_UPDATE(reader->_bytes_read_counter, buffer_desc->_len);
-        }
-
-        COUNTER_UPDATE(&_total_bytes_read_counter, buffer_desc->_len);
-        if (reader->_active_read_thread_counter) {
-            reader->_active_read_thread_counter->update(-1L);
-        }
-    }
-
-    // Finished read, update reader/disk based on the results
-    handle_read_finished(disk_queue, reader, buffer_desc);
-}
-
-void DiskIoMgr::write(RequestContext* writer_context, WriteRange* write_range) {
-    FILE* file_handle = fopen(write_range->file(), "rb+");
-    Status ret_status;
-    if (file_handle == nullptr) {
-        stringstream error_msg;
-        error_msg << "fopen(" << write_range->_file << ", \"rb+\") failed with errno=" << errno
-                  << " description=" << get_str_err_msg();
-        ret_status = Status::InternalError(error_msg.str());
-    } else {
-        ret_status = write_range_helper(file_handle, write_range);
-
-        int success = fclose(file_handle);
-        if (ret_status.ok() && success != 0) {
-            ret_status = Status::InternalError("fclose({}) failed", write_range->_file);
-        }
-    }
-
-    handle_write_finished(writer_context, write_range, ret_status);
-}
-
-Status DiskIoMgr::write_range_helper(FILE* file_handle, WriteRange* write_range) {
-    // Seek to the correct offset and perform the write.
-    int success = fseek(file_handle, write_range->offset(), SEEK_SET);
-    if (success != 0) {
-        return Status::InternalError("fseek({}, {} SEEK_SET) failed with errno={} description={}",
-                                     write_range->_file, write_range->offset(), errno,
-                                     get_str_err_msg());
-    }
-
-    int64_t bytes_written = fwrite(write_range->_data, 1, write_range->_len, file_handle);
-    if (bytes_written < write_range->_len) {
-        return Status::InternalError(
-                "fwrite(buffer, 1, {}, {}) failed with errno={} description={}", write_range->_len,
-                write_range->_file, errno, get_str_err_msg());
-    }
-
-    return Status::OK();
-}
-
-int DiskIoMgr::free_buffers_idx(int64_t buffer_size) {
-    int64_t buffer_size_scaled = bit_ceil(buffer_size, _min_buffer_size);
-    int idx = bit_log2(buffer_size_scaled);
-    DCHECK_GE(idx, 0);
-    DCHECK_LT(idx, _free_buffers.size());
-    return idx;
-}
-
-Status DiskIoMgr::add_write_range(RequestContext* writer, WriteRange* write_range) {
-    DCHECK_LE(write_range->len(), _max_buffer_size);
-    unique_lock<mutex> writer_lock(writer->_lock);
-
-    if (writer->_state == RequestContext::Cancelled) {
-        DCHECK(!writer->_status.ok());
-        return writer->_status;
-    }
-
-    writer->add_request_range(write_range, false);
-    return Status::OK();
-}
-
-/*
- * int DiskIoMgr::AssignQueue(const char* file, int disk_id, bool expected_local) {
- *   // If it's a remote range, check for an appropriate remote disk queue.
- *   if (!expected_local) {
- *     if (IsDfsPath(file) && FLAGS_num_remote_hdfs_io_threads > 0) return RemoteDfsDiskId();
- *     if (IsS3APath(file)) return RemoteS3DiskId();
- *   }
- *   // Assign to a local disk queue.
- *   DCHECK(!IsS3APath(file)); // S3 is always remote.
- *   if (disk_id == -1) {
- *     // disk id is unknown, assign it a random one.
- *     static int next_disk_id = 0;
- *     disk_id = next_disk_id++;
- *   }
- *   // TODO: we need to parse the config for the number of dirs configured for this
- *   // data node.
- *   return disk_id % num_local_disks();
- * }
- */
-
-/*
- * DiskIoMgr::HdfsCachedFileHandle* DiskIoMgr::OpenHdfsFile(const hdfsFS& fs,
- *     const char* fname, int64_t mtime) {
- *   HdfsCachedFileHandle* fh = nullptr;
- *
- *   // Check if a cached file handle exists and validate the mtime, if the mtime of the
- *   // cached handle is not matching the mtime of the requested file, reopen.
- *   if (detail::is_file_handle_caching_enabled() && _file_handle_cache.Pop(fname, &fh)) {
- *     if (fh->mtime() == mtime) {
- *       return fh;
- *     }
- *     VLOG_FILE << "mtime mismatch, closing cached file handle. Closing file=" << fname;
- *     delete fh;
- *   }
- *
- *   fh = new HdfsCachedFileHandle(fs, fname, mtime);
- *
- *   // Check if the file handle was opened correctly
- *   if (!fh->ok())  {
- *     VLOG_FILE << "Opening the file " << fname << " failed.";
- *     delete fh;
- *     return nullptr;
- *   }
- *
- *   return fh;
- * }
- */
-
-/*
- * void DiskIoMgr::cache_or_close_file_handle(const char* fname,
- *     DiskIoMgr::HdfsCachedFileHandle* fid, bool close) {
- *   // Try to unbuffer the handle, on filesystems that do not support this call a non-zero
- *   // return code indicates that the operation was not successful and thus the file is
- *   // closed.
- *   if (detail::is_file_handle_caching_enabled() &&
- *       !close && hdfsUnbufferFile(fid->file()) == 0) {
- *     // Clear read statistics before returning
- *     hdfsFileClearReadStatistics(fid->file());
- *     _file_handle_cache.Put(fname, fid);
- *   } else {
- *     if (close) {
- *       VLOG_FILE << "Closing file=" << fname;
- *     } else {
- *       VLOG_FILE << "FS does not support file handle unbuffering, closing file="
- *                 << fname;
- *     }
- *     delete fid;
- *   }
- * }
- */
-
-} // namespace doris
diff --git a/be/src/runtime/disk_io_mgr.h b/be/src/runtime/disk_io_mgr.h
deleted file mode 100644
index 9d0aa2f5ae..0000000000
--- a/be/src/runtime/disk_io_mgr.h
+++ /dev/null
@@ -1,837 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-// This file is copied from
-// https://github.com/apache/impala/blob/branch-2.9.0/be/src/runtime/disk-io-mgr.h
-// and modified by Doris
-
-#pragma once
-
-#include <condition_variable>
-#include <list>
-#include <mutex>
-#include <thread>
-#include <unordered_set>
-#include <vector>
-
-#include "common/config.h"
-#include "common/object_pool.h"
-#include "common/status.h"
-#include "runtime/memory/mem_tracker_limiter.h"
-#include "util/error_util.h"
-#include "util/internal_queue.h"
-#include "util/metrics.h"
-#include "util/runtime_profile.h"
-#include "util/thread_group.h"
-
-namespace doris {
-
-class MemTracker;
-
-// Manager object that schedules IO for all queries on all disks and remote filesystems
-// (such as S3). Each query maps to one or more RequestContext objects, each of which
-// has its own queue of scan ranges and/or write ranges.
-//
-// The API splits up requesting scan/write ranges (non-blocking) and reading the data
-// (blocking). The DiskIoMgr has worker threads that will read from and write to
-// disk/hdfs/remote-filesystems, allowing interleaving of IO and CPU. This allows us to
-// keep all disks and all cores as busy as possible.
-//
-// All public APIs are thread-safe. It is not valid to call any of the APIs after
-// unregister_context() returns.
-//
-// For Readers:
-// We can model this problem as a multiple producer (threads for each disk), multiple
-// consumer (scan ranges) problem. There are multiple queues that need to be
-// synchronized. Conceptually, there are two queues:
-//   1. The per disk queue: this contains a queue of readers that need reads.
-//   2. The per scan range ready-buffer queue: this contains buffers that have been
-//      read and are ready for the caller.
-// The disk queue contains a queue of readers and is scheduled in a round robin fashion.
-// Readers map to scan nodes. The reader then contains a queue of scan ranges. The caller
-// asks the IoMgr for the next range to process. The IoMgr then selects the best range
-// to read based on disk activity and begins reading and queuing buffers for that range.
-// TODO: We should map readers to queries. A reader is the unit of scheduling and queries
-// that have multiple scan nodes shouldn't have more 'turns'.
-//
-// For Writers:
-// Data is written via add_write_range(). This is non-blocking and adds a WriteRange to a
-// per-disk queue. After the write is complete, a callback in WriteRange is invoked.
-// No memory is allocated within IoMgr for writes and no copies are made. It is the
-// responsibility of the client to ensure that the data to be written is valid and that
-// the file to be written to exists until the callback is invoked.
-//
-// The IoMgr provides three key APIs.
-//  1. add_scan_ranges: this is non-blocking and tells the IoMgr all the ranges that
-//     will eventually need to be read.
-//  2. get_next_range: returns to the caller the next scan range it should process.
-//     This is based on disk load. This also begins reading the data in this scan
-//     range. This is blocking.
-//  3. ScanRange::get_next: returns the next buffer for this range.  This is blocking.
-//
-// The disk threads do not synchronize with each other. The readers and writers don't
-// synchronize with each other. There is a lock and condition variable for each request
-// context queue and each disk queue.
-// IMPORTANT: whenever both locks are needed, the lock order is to grab the context lock
-// before the disk lock.
-//
-// Scheduling: If there are multiple request contexts with work for a single disk, the
-// request contexts are scheduled in round-robin order. Multiple disk threads can
-// operate on the same request context. Exactly one request range is processed by a
-// disk thread at a time. If there are multiple scan ranges scheduled via
-// get_next_range() for a single context, these are processed in round-robin order.
-// If there are multiple scan and write ranges for a disk, a read is always followed
-// by a write, and a write is followed by a read, i.e. reads and writes alternate.
-// If multiple write ranges are enqueued for a single disk, they will be processed
-// by the disk threads in order, but may complete in any order. No guarantees are made
-// on ordering of writes across disks.
-//
-// Resource Management: effective resource management in the IoMgr is key to good
-// performance. The IoMgr helps coordinate two resources: CPU and disk. For CPU,
-// spinning up too many threads causes thrashing.
-// Memory usage in the IoMgr comes from queued read buffers.  If we queue the minimum
-// (i.e. 1), then the disks are idle while we are processing the buffer. If we don't
-// limit the queue, then it possible we end up queueing the entire data set (i.e. CPU
-// is slower than disks) and run out of memory.
-// For both CPU and memory, we want to model the machine as having a fixed amount of
-// resources.  If a single query is running, it should saturate either CPU or Disk
-// as well as using as little memory as possible. With multiple queries, each query
-// should get less CPU. In that case each query will need fewer queued buffers and
-// therefore have less memory usage.
-//
-// The IoMgr defers CPU management to the caller. The IoMgr provides a get_next_range
-// API which will return the next scan range the caller should process. The caller
-// can call this from the desired number of reading threads. Once a scan range
-// has been returned via get_next_range, the IoMgr will start to buffer reads for
-// that range and it is expected the caller will pull those buffers promptly. For
-// example, if the caller would like to have 1 scanner thread, the read loop
-// would look like:
-//   while (more_ranges)
-//     range = get_next_range()
-//     while (!range.eosr)
-//       buffer = range.get_next()
-// To have multiple reading threads, the caller would simply spin up the threads
-// and each would process the loops above.
-//
-// To control the number of IO buffers, each scan range has a soft max capacity for
-// the number of queued buffers. If the number of buffers is at capacity, the IoMgr
-// will no longer read for that scan range until the caller has processed a buffer.
-// This capacity does not need to be fixed, and the caller can dynamically adjust
-// it if necessary.
-//
-// As an example: If we allowed 5 buffers per range on a 24 core, 72 thread
-// (we default to allowing 3x threads) machine, we should see at most
-// 72 * 5 * 8MB = 2.8GB in io buffers memory usage. This should remain roughly constant
-// regardless of how many concurrent readers are running.
-//
-// Buffer Management:
-// Buffers are allocated by the IoMgr as necessary to service reads. These buffers
-// are directly returned to the caller. The caller must call Return() on the buffer
-// when it is done, at which point the buffer will be recycled for another read. In error
-// cases, the IoMgr will recycle the buffers more promptly but regardless, the caller
-// must always call Return()
-//
-// Caching support:
-// Scan ranges contain metadata on whether or not it is cached on the DN. In that
-// case, we use the HDFS APIs to read the cached data without doing any copies. For these
-// ranges, the reads happen on the caller thread (as opposed to the disk threads).
-// It is possible for the cached read APIs to fail, in which case the ranges are then
-// queued on the disk threads and behave identically to the case where the range
-// is not cached.
-// Resources for these ranges are also not accounted against the reader because none
-// are consumed.
-// While a cached block is being processed, the block is mlocked. We want to minimize
-// the time the mlock is held.
-//   - HDFS will time us out if we hold onto the mlock for too long
-//   - Holding the lock prevents uncaching this file due to a caching policy change.
-// Therefore, we only issue the cached read when the caller is ready to process the
-// range (get_next_range()) instead of when the ranges are issued. This guarantees that
-// there will be a CPU available to process the buffer and any throttling we do with
-// the number of scanner threads properly controls the amount of files we mlock.
-// With cached scan ranges, we cannot close the scan range until the cached buffer
-// is returned (HDFS does not allow this). We therefore need to defer the close until
-// the cached buffer is returned (BufferDescriptor::Return()).
-//
-// Remote filesystem support (e.g. S3):
-// Remote filesystems are modeled as "remote disks". That is, there is a separate disk
-// queue for each supported remote filesystem type. In order to maximize throughput,
-// multiple connections are opened in parallel by having multiple threads running per
-// queue. Also note that reading from a remote filesystem service can be more CPU
-// intensive than local disk/hdfs because of non-direct I/O and SSL processing, and can
-// be CPU bottlenecked especially if not enough I/O threads for these queues are
-// started.
-//
-// TODO: IoMgr should be able to request additional scan ranges from the coordinator
-// to help deal with stragglers.
-// TODO: look into using a lock free queue
-// TODO: simplify the common path (less locking, memory allocations).
-// TODO: Break this up the .h/.cc into multiple files under an /io subdirectory.
-//
-// Structure of the Implementation:
-//  - All client APIs are defined in this file
-//  - Internal classes are defined in disk-io-mgr-internal.h
-//  - ScanRange APIs are implemented in disk-io-mgr-scan-range.cc
-//    This contains the ready buffer queue logic
-//  - RequestContext APIs are implemented in disk-io-mgr-reader-context.cc
-//    This contains the logic for picking scan ranges for a reader.
-//  - Disk Thread and general APIs are implemented in disk-io-mgr.cc.
-
-typedef void* hdfsFS;
-typedef void* hdfsFile;
-
-class DiskIoMgr {
-public:
-    class RequestContext;
-    class ScanRange;
-
-    // This class is a small wrapper around the hdfsFile handle and the file system
-    // instance which is needed to close the file handle in case of eviction. It
-    // additionally encapsulates the last modified time of the associated file when it was
-    // last opened.
-    class HdfsCachedFileHandle {
-    public:
-        // Constructor will open the file
-        HdfsCachedFileHandle(const hdfsFS& fs, const char* fname, int64_t mtime);
-
-        // Destructor will close the file handle
-        ~HdfsCachedFileHandle();
-
-        hdfsFile file() const { return _hdfs_file; }
-
-        int64_t mtime() const { return _mtime; }
-
-        // This method is called to release acquired resources by the cached handle when it
-        // is evicted.
-        static void release(HdfsCachedFileHandle** h);
-
-        bool ok() const { return _hdfs_file != nullptr; }
-
-    private:
-        hdfsFS _fs;
-        hdfsFile _hdfs_file;
-        int64_t _mtime;
-    };
-
-    // Buffer struct that is used by the caller and IoMgr to pass read buffers.
-    // It is expected that only one thread has ownership of this object at a
-    // time.
-    class BufferDescriptor {
-    public:
-        // a null dtor to pass codestyle check
-        ~BufferDescriptor() {}
-
-        ScanRange* scan_range() { return _scan_range; }
-        char* buffer() { return _buffer; }
-        int64_t buffer_len() { return _buffer_len; }
-        int64_t len() { return _len; }
-        bool eosr() { return _eosr; }
-
-        // Returns the offset within the scan range that this buffer starts at
-        int64_t scan_range_offset() const { return _scan_range_offset; }
-
-        // Returns the buffer to the IoMgr. This must be called for every buffer
-        // returned by get_next()/read() that did not return an error. This is non-blocking.
-        // After calling this, the buffer descriptor is invalid and cannot be accessed.
-        void return_buffer();
-
-    private:
-        friend class DiskIoMgr;
-        BufferDescriptor(DiskIoMgr* io_mgr);
-
-        // Resets the buffer descriptor state for a new reader, range and data buffer.
-        void reset(RequestContext* reader, ScanRange* range, char* buffer, int64_t buffer_len);
-
-        DiskIoMgr* _io_mgr;
-
-        // Reader that this buffer is for
-        RequestContext* _reader;
-
-        // Scan range that this buffer is for.
-        ScanRange* _scan_range;
-
-        // buffer with the read contents
-        char* _buffer;
-
-        // length of _buffer. For buffers from cached reads, the length is 0.
-        int64_t _buffer_len;
-
-        // length of read contents
-        int64_t _len;
-
-        // true if the current scan range is complete
-        bool _eosr;
-
-        // Status of the read to this buffer. if status is not ok, 'buffer' is nullptr
-        Status _status;
-
-        int64_t _scan_range_offset;
-    };
-
-    // The request type, read or write associated with a request range.
-    struct RequestType {
-        enum type {
-            READ,
-            WRITE,
-        };
-    };
-
-    // Represents a contiguous sequence of bytes in a single file.
-    // This is the common base class for read and write IO requests - ScanRange and
-    // WriteRange. Each disk thread processes exactly one RequestRange at a time.
-    class RequestRange : public InternalQueue<RequestRange>::Node {
-    public:
-        // hdfsFS fs() const { return _fs; }
-        const char* file() const { return _file.c_str(); }
-        int64_t offset() const { return _offset; }
-        int64_t len() const { return _len; }
-        int disk_id() const { return _disk_id; }
-        RequestType::type request_type() const { return _request_type; }
-
-    protected:
-        // Hadoop filesystem that contains _file, or set to nullptr for local filesystem.
-        hdfsFS _fs;
-
-        // Path to file being read or written.
-        std::string _file;
-
-        // Offset within _file being read or written.
-        int64_t _offset;
-
-        // Length of data read or written.
-        int64_t _len;
-
-        // Id of disk containing byte range.
-        int _disk_id;
-
-        // The type of IO request, READ or WRITE.
-        RequestType::type _request_type;
-    };
-
-    // ScanRange description. The caller must call Reset() to initialize the fields
-    // before calling add_scan_ranges(). The private fields are used internally by
-    // the IoMgr.
-    class ScanRange : public RequestRange {
-    public:
-        // If the mtime is set to NEVER_CACHE, the file handle should never be cached.
-        const static int64_t NEVER_CACHE = -1;
-
-        // The initial queue capacity for this.  Specify -1 to use IoMgr default.
-        ScanRange() : ScanRange(-1) {}
-        ScanRange(int initial_capacity);
-
-        virtual ~ScanRange();
-
-        // Resets this scan range object with the scan range description.  The scan range
-        // must fall within the file bounds (offset >= 0 and offset + len <= file_length).
-        // Resets this scan range object with the scan range description.
-        void reset(hdfsFS fs, const char* file, int64_t len, int64_t offset, int disk_id,
-                   bool try_cache, bool expected_local, int64_t mtime, void* metadata = nullptr);
-
-        void* meta_data() const { return _meta_data; }
-        // bool try_cache() const { return _try_cache; }
-        bool expected_local() const { return _expected_local; }
-        int ready_buffers_capacity() const { return _ready_buffers_capacity; }
-
-        // Returns the next buffer for this scan range. buffer is an output parameter.
-        // This function blocks until a buffer is ready or an error occurred. If this is
-        // called when all buffers have been returned, *buffer is set to nullptr and Status::OK()
-        // is returned.
-        // Only one thread can be in get_next() at any time.
-        Status get_next(BufferDescriptor** buffer);
-
-        // Cancel this scan range. This cleans up all queued buffers and
-        // wakes up any threads blocked on get_next().
-        // Status is the reason the range was cancelled. Must not be ok().
-        // Status is returned to the user in get_next().
-        void cancel(const Status& status);
-
-        // return a descriptive string for debug.
-        std::string debug_string() const;
-
-        int64_t mtime() const { return _mtime; }
-
-    private:
-        friend class DiskIoMgr;
-
-        // Initialize internal fields
-        void init_internal(DiskIoMgr* io_mgr, RequestContext* reader);
-
-        // Enqueues a buffer for this range. This does not block.
-        // Returns true if this scan range has hit the queue capacity, false otherwise.
-        // The caller passes ownership of buffer to the scan range and it is not
-        // valid to access buffer after this call.
-        bool enqueue_buffer(BufferDescriptor* buffer);
-
-        // Cleanup any queued buffers (i.e. due to cancellation). This cannot
-        // be called with any locks taken.
-        void cleanup_queued_buffers();
-
-        // Validates the internal state of this range. _lock must be taken
-        // before calling this.
-        bool validate();
-
-        // Maximum length in bytes for hdfsRead() calls.
-        int64_t max_read_chunk_size() const;
-
-        // Opens the file for this range. This function only modifies state in this range.
-        Status open();
-
-        // Closes the file for this range. This function only modifies state in this range.
-        void close();
-
-        // Reads from this range into 'buffer'. Buffer is preallocated. Returns the number
-        // of bytes read. Updates range to keep track of where in the file we are.
-        Status read(char* buffer, int64_t* bytes_read, bool* eosr);
-
-        // Reads from the DN cache. On success, sets _cached_buffer to the DN buffer
-        // and *read_succeeded to true.
-        // If the data is not cached, returns ok() and *read_succeeded is set to false.
-        // Returns a non-ok status if it ran into a non-continuable error.
-        Status read_from_cache(bool* read_succeeded);
-
-        // Pointer to caller specified metadata. This is untouched by the io manager
-        // and the caller can put whatever auxiliary data in here.
-        void* _meta_data;
-
-        // If true, this scan range is expected to be cached. Note that this might be wrong
-        // since the block could have been uncached. In that case, the cached path
-        // will fail and we'll just put the scan range on the normal read path.
-        bool _try_cache;
-
-        // If true, we expect this scan range to be a local read. Note that if this is false,
-        // it does not necessarily mean we expect the read to be remote, and that we never
-        // create scan ranges where some of the range is expected to be remote and some of it
-        // local.
-        // TODO: we can do more with this
-        bool _expected_local;
-
-        DiskIoMgr* _io_mgr;
-
-        // Reader/owner of the scan range
-        RequestContext* _reader;
-
-        // File handle either to hdfs or local fs (FILE*)
-        //
-        // TODO: The pointer to HdfsCachedFileHandle is manually managed and should be
-        // replaced by unique_ptr in C++11
-        union {
-            FILE* _local_file;
-            HdfsCachedFileHandle* _hdfs_file;
-        };
-
-        // If non-null, this is DN cached buffer. This means the cached read succeeded
-        // and all the bytes for the range are in this buffer.
-        // TODO(zxy) Not used, maybe delete
-        struct hadoopRzBuffer* _cached_buffer;
-
-        // Lock protecting fields below.
-        // This lock should not be taken during Open/Read/Close.
-        std::mutex _lock;
-
-        // Number of bytes read so far for this scan range
-        int _bytes_read;
-
-        // Status for this range. This is non-ok if _is_cancelled is true.
-        // Note: an individual range can fail without the RequestContext being
-        // cancelled. This allows us to skip individual ranges.
-        Status _status;
-
-        // If true, the last buffer for this scan range has been queued.
-        bool _eosr_queued;
-
-        // If true, the last buffer for this scan range has been returned.
-        bool _eosr_returned;
-
-        // If true, this scan range has been removed from the reader's in_flight_ranges
-        // queue because the _ready_buffers queue is full.
-        bool _blocked_on_queue;
-
-        // IO buffers that are queued for this scan range.
-        // Condition variable for get_next
-        std::condition_variable _buffer_ready_cv;
-        std::list<BufferDescriptor*> _ready_buffers;
-
-        // The soft capacity limit for _ready_buffers. _ready_buffers can exceed
-        // the limit temporarily as the capacity is adjusted dynamically.
-        // In that case, the capacity is only realized when the caller removes buffers
-        // from _ready_buffers.
-        int _ready_buffers_capacity;
-
-        // Lock that should be taken during hdfs calls. Only one thread (the disk reading
-        // thread) calls into hdfs at a time so this lock does not have performance impact.
-        // This lock only serves to coordinate cleanup. Specifically it serves to ensure
-        // that the disk threads are finished with HDFS calls before _is_cancelled is set
-        // to true and cleanup starts.
-        // If this lock and _lock need to be taken, _lock must be taken first.
-        std::mutex _hdfs_lock;
-
-        // If true, this scan range has been cancelled.
-        bool _is_cancelled;
-
-        // Last modified time of the file associated with the scan range
-        int64_t _mtime;
-    };
-
-    // Used to specify data to be written to a file and offset.
-    // It is the responsibility of the client to ensure that the data to be written is
-    // valid and that the file to be written to exists until the callback is invoked.
-    // A callback is invoked to inform the client when the write is done.
-    class WriteRange : public RequestRange {
-    public:
-        // a null dtor to pass codestyle check
-        ~WriteRange() {}
-
-        // This callback is invoked on each WriteRange after the write is complete or the
-        // context is cancelled. The status returned by the callback parameter indicates
-        // if the write was successful (i.e. Status::OK()), if there was an error
-        // TStatusCode::RUNTIME_ERROR) or if the context was cancelled
-        // (TStatusCode::CANCELLED). The callback is only invoked if this WriteRange was
-        // successfully added (i.e. add_write_range() succeeded). No locks are held while
-        // the callback is invoked.
-        typedef std::function<void(const Status&)> WriteDoneCallback;
-        WriteRange(const std::string& file, int64_t file_offset, int disk_id,
-                   WriteDoneCallback callback);
-
-        // Set the data and number of bytes to be written for this WriteRange.
-        // File data can be over-written by calling set_data() and add_write_range().
-        void set_data(const uint8_t* buffer, int64_t len);
-
-    private:
-        friend class DiskIoMgr;
-
-        // Data to be written. RequestRange::_len contains the length of data
-        // to be written.
-        const uint8_t* _data;
-
-        // Callback to invoke after the write is complete.
-        WriteDoneCallback _callback;
-    };
-
-    // Create a DiskIoMgr object.
-    //  - num_disks: The number of disks the IoMgr should use. This is used for testing.
-    //    Specify 0, to have the disk IoMgr query the os for the number of disks.
-    //  - threads_per_disk: number of read threads to create per disk. This is also
-    //    the max queue depth.
-    //  - min_buffer_size: minimum io buffer size (in bytes)
-    //  - max_buffer_size: maximum io buffer size (in bytes). Also the max read size.
-    DiskIoMgr(int num_disks, int threads_per_disk, int min_buffer_size, int max_buffer_size);
-
-    // Create DiskIoMgr with default configs.
-    DiskIoMgr();
-
-    // Clean up all threads and resources. This is mostly useful for testing since
-    // for impalad, this object is never destroyed.
-    ~DiskIoMgr();
-
-    // Initialize the IoMgr. Must be called once before any of the other APIs.
-    Status init(const int64_t mem_limit);
-
-    // Allocates tracking structure for a request context.
-    // Register a new request context which is returned in *request_context.
-    // The IoMgr owns the allocated RequestContext object. The caller must call
-    // unregister_context() for each context.
-    // reader_mem_tracker: Is non-null only for readers. IO buffers
-    //    used for this reader will be tracked by this. If the limit is exceeded
-    //    the reader will be cancelled and MEM_LIMIT_EXCEEDED will be returned via
-    //    get_next().
-    Status register_context(RequestContext** request_context);
-
-    // Unregisters context from the disk IoMgr. This must be called for every
-    // register_context() regardless of cancellation and must be called in the
-    // same thread as get_next()
-    // The 'context' cannot be used after this call.
-    // This call blocks until all the disk threads have finished cleaning up.
-    // unregister_context also cancels the reader/writer from the disk IoMgr.
-    void unregister_context(RequestContext* context);
-
-    // This function cancels the context asynchronously. All outstanding requests
-    // are aborted and tracking structures cleaned up. This does not need to be
-    // called if the context finishes normally.
-    // This will also fail any outstanding get_next()/Read requests.
-    // If wait_for_disks_completion is true, wait for the number of active disks for this
-    // context to reach 0. After calling with wait_for_disks_completion = true, the only
-    // valid API is returning IO buffers that have already been returned.
-    // Takes context->_lock if wait_for_disks_completion is true.
-    void cancel_context(RequestContext* context, bool wait_for_disks_completion = false);
-
-    // Adds the scan ranges to the queues. This call is non-blocking. The caller must
-    // not deallocate the scan range pointers before unregister_context().
-    // If schedule_immediately, the ranges are immediately put on the read queue
-    // (i.e. the caller should not/cannot call get_next_range for these ranges).
-    // This can be used to do synchronous reads as well as schedule dependent ranges,
-    // as in the case for columnar formats.
-    Status add_scan_ranges(RequestContext* reader, const std::vector<ScanRange*>& ranges,
-                           bool schedule_immediately = false);
-
-    // Add a WriteRange for the writer. This is non-blocking and schedules the context
-    // on the IoMgr disk queue. Does not create any files.
-    Status add_write_range(RequestContext* writer, WriteRange* write_range);
-
-    // Returns the next unstarted scan range for this reader. When the range is returned,
-    // the disk threads in the IoMgr will already have started reading from it. The
-    // caller is expected to call ScanRange::get_next on the returned range.
-    // If there are no more unstarted ranges, nullptr is returned.
-    // This call is blocking.
-    Status get_next_range(RequestContext* reader, ScanRange** range);
-
-    // Reads the range and returns the result in buffer.
-    // This behaves like the typical synchronous read() api, blocking until the data
-    // is read. This can be called while there are outstanding ScanRanges and is
-    // thread safe. Multiple threads can be calling read() per reader at a time.
-    // range *cannot* have already been added via add_scan_ranges.
-    Status read(RequestContext* reader, ScanRange* range, BufferDescriptor** buffer);
-
-    // Determine which disk queue this file should be assigned to.  Returns an index into
-    // _disk_queues.  The disk_id is the volume ID for the local disk that holds the
-    // files, or -1 if unknown.  Flag expected_local is true iff this impalad is
-    // co-located with the datanode for this file.
-    /*
-     * int AssignQueue(const char* file, int disk_id, bool expected_local);
-     */
-
-    // TODO: The functions below can be moved to RequestContext.
-    // Returns the current status of the context.
-    Status context_status(RequestContext* context) const;
-
-    // Returns the number of unstarted scan ranges for this reader.
-    int num_unstarted_ranges(RequestContext* reader) const;
-
-    void set_bytes_read_counter(RequestContext*, RuntimeProfile::Counter*);
-    void set_read_timer(RequestContext*, RuntimeProfile::Counter*);
-    void set_active_read_thread_counter(RequestContext*, RuntimeProfile::Counter*);
-    void set_disks_access_bitmap(RequestContext*, RuntimeProfile::Counter*);
-
-    int64_t queue_size(RequestContext* reader) const;
-    int64_t bytes_read_local(RequestContext* reader) const;
-    int64_t bytes_read_short_circuit(RequestContext* reader) const;
-    int64_t bytes_read_dn_cache(RequestContext* reader) const;
-    int num_remote_ranges(RequestContext* reader) const;
-    int64_t unexpected_remote_bytes(RequestContext* reader) const;
-    MemTrackerLimiter* mem_tracker() const { return _mem_tracker.get(); }
-
-    // Returns the read throughput across all readers.
-    // TODO: should this be a sliding window?  This should report metrics for the
-    // last minute, hour and since the beginning.
-    int64_t get_read_throughput();
-
-    // Returns the maximum read buffer size
-    int max_read_buffer_size() const { return _max_buffer_size; }
-
-    // Returns the total number of disk queues (both local and remote).
-    int num_total_disks() const { return _disk_queues.size(); }
-
-    // Returns the total number of remote "disk" queues.
-    int num_remote_disks() const { return REMOTE_NUM_DISKS; }
-
-    // Returns the number of local disks attached to the system.
-    int num_local_disks() const { return num_total_disks() - num_remote_disks(); }
-
-    // The disk ID (and therefore _disk_queues index) used for DFS accesses.
-    // int RemoteDfsDiskId() const { return num_local_disks() + REMOTE_DFS_DISK_OFFSET; }
-
-    // The disk ID (and therefore _disk_queues index) used for S3 accesses.
-    // int RemoteS3DiskId() const { return num_local_disks() + REMOTE_S3_DISK_OFFSET; }
-
-    // Returns the number of allocated buffers.
-    int num_allocated_buffers() const { return _num_allocated_buffers; }
-
-    // Returns the number of buffers currently owned by all readers.
-    int num_buffers_in_readers() const { return _num_buffers_in_readers; }
-
-    // Dumps the disk IoMgr queues (for readers and disks)
-    std::string debug_string();
-
-    // Validates the internal state is consistent. This is intended to only be used
-    // for debugging.
-    bool validate() const;
-
-    // Given a FS handle, name and last modified time of the file, tries to open that file
-    // and return an instance of HdfsCachedFileHandle. In case of an error returns nullptr.
-    // HdfsCachedFileHandle* OpenHdfsFile(const hdfsFS& fs, const char* fname, int64_t mtime);
-
-    // When the file handle is no longer in use by the scan range, return it and try to
-    // unbuffer the handle. If unbuffering, closing sockets and dropping buffers in the
-    // libhdfs client, is not supported, close the file handle. If the unbuffer operation
-    // is supported, put the file handle together with the mtime in the LRU cache for
-    // later reuse.
-    // void cache_or_close_file_handle(const char* fname, HdfsCachedFileHandle* fid, bool close);
-
-    // Default ready buffer queue capacity. This constant doesn't matter too much
-    // since the system dynamically adjusts.
-    static const int DEFAULT_QUEUE_CAPACITY;
-
-    // "Disk" queue offsets for remote accesses.  Offset 0 corresponds to
-    // disk ID (i.e. _disk_queue index) of num_local_disks().
-    enum { REMOTE_DFS_DISK_OFFSET = 0, REMOTE_S3_DISK_OFFSET, REMOTE_NUM_DISKS };
-
-private:
-    friend class BufferDescriptor;
-    struct DiskQueue;
-    class RequestContextCache;
-
-    // Pool to allocate BufferDescriptors.
-    ObjectPool _pool;
-
-    std::unique_ptr<MemTrackerLimiter> _mem_tracker;
-
-    // Number of worker(read) threads per disk. Also the max depth of queued
-    // work to the disk.
-    const int _num_threads_per_disk;
-
-    // Maximum read size. This is also the maximum size of each allocated buffer.
-    const int _max_buffer_size;
-
-    // The minimum size of each read buffer.
-    const int _min_buffer_size;
-
-    // Thread group containing all the worker threads.
-    // ThreadGroup _disk_thread_group;
-    ThreadGroup _disk_thread_group;
-
-    // Options object for cached hdfs reads. Set on startup and never modified.
-    struct hadoopRzOptions* _cached_read_options;
-
-    // True if the IoMgr should be torn down. Worker threads watch for this to
-    // know to terminate. This variable is read/written to by different threads.
-    std::atomic<bool> _shut_down;
-
-    // Total bytes read by the IoMgr.
-    RuntimeProfile::Counter _total_bytes_read_counter;
-
-    // Total time spent in hdfs reading
-    RuntimeProfile::Counter _read_timer;
-
-    // Contains all contexts that the IoMgr is tracking. This includes contexts that are
-    // active as well as those in the process of being cancelled. This is a cache
-    // of context objects that get recycled to minimize object allocations and lock
-    // contention.
-    std::unique_ptr<RequestContextCache> _request_context_cache;
-
-    // Protects _free_buffers and _free_buffer_descs
-    std::mutex _free_buffers_lock;
-
-    // Free buffers that can be handed out to clients. There is one list for each buffer
-    // size, indexed by the Log2 of the buffer size in units of _min_buffer_size. The
-    // maximum buffer size is _max_buffer_size, so the maximum index is
-    // Log2(_max_buffer_size / _min_buffer_size).
-    //
-    // E.g. if _min_buffer_size = 1024 bytes:
-    //  _free_buffers[0]  => list of free buffers with size 1024 B
-    //  _free_buffers[1]  => list of free buffers with size 2048 B
-    //  _free_buffers[10] => list of free buffers with size 1 MB
-    //  _free_buffers[13] => list of free buffers with size 8 MB
-    //  _free_buffers[n]  => list of free buffers with size 2^n * 1024 B
-    std::vector<std::list<char*>> _free_buffers;
-
-    // List of free buffer desc objects that can be handed out to clients
-    std::list<BufferDescriptor*> _free_buffer_descs;
-
-    // Total number of allocated buffers, used for debugging.
-    std::atomic<int> _num_allocated_buffers {0};
-
-    // Total number of buffers in readers
-    std::atomic<int> _num_buffers_in_readers {0};
-
-    // Per disk queues. This is static and created once at init() time.  One queue is
-    // allocated for each local disk on the system and for each remote filesystem type.
-    // It is indexed by disk id.
-    std::vector<DiskQueue*> _disk_queues;
-
-    // Caching structure that maps file names to cached file handles. The cache has an upper
-    // limit of entries defined by FLAGS_max_cached_file_handles. Evicted cached file
-    // handles are closed.
-    // FifoMultimap<std::string, HdfsCachedFileHandle*> _file_handle_cache;
-    std::multimap<std::string, HdfsCachedFileHandle*> _file_handle_cache;
-
-    // Returns the index into _free_buffers for a given buffer size
-    int free_buffers_idx(int64_t buffer_size);
-
-    // Gets a buffer description object, initialized for this reader, allocating one as
-    // necessary. buffer_size / _min_buffer_size should be a power of 2, and buffer_size
-    // should be <= _max_buffer_size. These constraints will be met if buffer was acquired
-    // via get_free_buffer() (which it should have been).
-    BufferDescriptor* get_buffer_desc(RequestContext* reader, ScanRange* range, char* buffer,
-                                      int64_t buffer_size);
-
-    // Returns a buffer desc object which can now be used for another reader.
-    void return_buffer_desc(BufferDescriptor* desc);
-
-    // Returns the buffer desc and underlying buffer to the disk IoMgr. This also updates
-    // the reader and disk queue state.
-    void return_buffer(BufferDescriptor* buffer);
-
-    // Returns a buffer to read into with size between *buffer_size and _max_buffer_size,
-    // and *buffer_size is set to the size of the buffer. If there is an
-    // appropriately-sized free buffer in the '_free_buffers', that is returned, otherwise
-    // a new one is allocated. *buffer_size must be between 0 and _max_buffer_size.
-    char* get_free_buffer(int64_t* buffer_size);
-
-    // Garbage collect all unused io buffers. This is currently only triggered when the
-    // process wide limit is hit.
-    // TODO: make this run periodically?
-    void gc_io_buffers(int64_t bytes_to_free = INT_MAX);
-
-    // Returns a buffer to the free list. buffer_size / _min_buffer_size should be a power
-    // of 2, and buffer_size should be <= _max_buffer_size. These constraints will be met
-    // if buffer was acquired via get_free_buffer() (which it should have been).
-    void return_free_buffer(char* buffer, int64_t buffer_size);
-
-    // Returns the buffer in desc (cannot be nullptr), sets buffer to nullptr
-    void return_free_buffer(BufferDescriptor* desc);
-
-    // Disk worker thread loop. This function retrieves the next range to process on
-    // the disk queue and invokes read_range() or Write() depending on the type of Range().
-    // There can be multiple threads per disk running this loop.
-    void work_loop(DiskQueue* queue);
-
-    // This is called from the disk thread to get the next range to process. It will
-    // wait until a scan range and buffer are available, or a write range is available.
-    // This functions returns the range to process.
-    // Only returns false if the disk thread should be shut down.
-    // No locks should be taken before this function call and none are left taken after.
-    bool get_next_request_range(DiskQueue* disk_queue, RequestRange** range,
-                                RequestContext** request_context);
-
-    // Updates disk queue and reader state after a read is complete. The read result
-    // is captured in the buffer descriptor.
-    void handle_read_finished(DiskQueue*, RequestContext*, BufferDescriptor*);
-
-    // Invokes write_range->_callback  after the range has been written and
-    // updates per-disk state and handle state. The status of the write OK/RUNTIME_ERROR
-    // etc. is passed via write_status and to the callback.
-    // The write_status does not affect the writer->_status. That is, a write error does
-    // not cancel the writer context - that decision is left to the callback handler.
-    // TODO: On the read path, consider not canceling the reader context on error.
-    void handle_write_finished(RequestContext* writer, WriteRange* write_range,
-                               const Status& write_status);
-
-    // Validates that range is correctly initialized
-    Status validate_scan_range(ScanRange* range);
-
-    // Write the specified range to disk and calls handle_write_finished when done.
-    // Responsible for opening and closing the file that is written.
-    void write(RequestContext* writer_context, WriteRange* write_range);
-
-    // Helper method to write a range using the specified FILE handle. Returns Status:OK
-    // if the write succeeded, or a RUNTIME_ERROR with an appropriate message otherwise.
-    // Does not open or close the file that is written.
-    Status write_range_helper(FILE* file_handle, WriteRange* write_range);
-
-    // Reads the specified scan range and calls handle_read_finished when done.
-    void read_range(DiskQueue* disk_queue, RequestContext* reader, ScanRange* range);
-};
-
-} // end namespace doris
diff --git a/be/src/runtime/disk_io_mgr_internal.h b/be/src/runtime/disk_io_mgr_internal.h
deleted file mode 100644
index 46b74aa946..0000000000
--- a/be/src/runtime/disk_io_mgr_internal.h
+++ /dev/null
@@ -1,455 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-// This file is copied from
-// https://github.com/apache/impala/blob/branch-2.9.0/be/src/runtime/disk-io-mgr-internal.h
-// and modified by Doris
-
-#pragma once
-
-#include <unistd.h>
-
-#include <queue>
-
-#include "common/logging.h"
-#include "common/status.h"
-#include "disk_io_mgr.h"
-#include "util/cpu_info.h"
-#include "util/debug_util.h"
-#include "util/disk_info.h"
-#include "util/filesystem_util.h"
-
-// This file contains internal structures to the IoMgr. Users of the IoMgr do
-// not need to include this file.
-namespace doris {
-
-// Per disk state
-struct DiskIoMgr::DiskQueue {
-    // Disk id (0-based)
-    int disk_id;
-
-    // Lock that protects access to 'request_contexts' and 'work_available'
-    std::mutex lock;
-
-    // Condition variable to signal the disk threads that there is work to do or the
-    // thread should shut down.  A disk thread will be woken up when there is a reader
-    // added to the queue. A reader is only on the queue when it has at least one
-    // scan range that is not blocked on available buffers.
-    std::condition_variable work_available;
-
-    // list of all request contexts that have work queued on this disk
-    std::list<RequestContext*> request_contexts;
-
-    // Enqueue the request context to the disk queue.  The DiskQueue lock must not be taken.
-    void enqueue_context(RequestContext* worker) {
-        {
-            std::unique_lock<std::mutex> disk_lock(lock);
-            // Check that the reader is not already on the queue
-            DCHECK(find(request_contexts.begin(), request_contexts.end(), worker) ==
-                   request_contexts.end());
-            request_contexts.push_back(worker);
-        }
-        work_available.notify_all();
-    }
-
-    DiskQueue(int id) : disk_id(id) {}
-};
-
-// Internal per request-context state. This object maintains a lot of state that is
-// carefully synchronized. The context maintains state across all disks as well as
-// per disk state.
-// The unit for an IO request is a RequestRange, which may be a ScanRange or a
-// WriteRange.
-// A scan range for the reader is in one of five states:
-// 1) PerDiskState's unstarted_ranges: This range has only been queued
-//    and nothing has been read from it.
-// 2) RequestContext's _ready_to_start_ranges: This range is about to be started.
-//    As soon as the reader picks it up, it will move to the in_flight_ranges
-//    queue.
-// 3) PerDiskState's in_flight_ranges: This range is being processed and will
-//    be read from the next time a disk thread picks it up in get_next_request_range()
-// 4) ScanRange's outgoing ready buffers is full. We can't read for this range
-//    anymore. We need the caller to pull a buffer off which will put this in
-//    the in_flight_ranges queue. These ranges are in the RequestContext's
-//    _blocked_ranges queue.
-// 5) ScanRange is cached and in the _cached_ranges queue.
-//
-// If the scan range is read and does not get blocked on the outgoing queue, the
-// transitions are: 1 -> 2 -> 3.
-// If the scan range does get blocked, the transitions are
-// 1 -> 2 -> 3 -> (4 -> 3)*
-//
-// In the case of a cached scan range, the range is immediately put in _cached_ranges.
-// When the caller asks for the next range to process, we first pull ranges from
-// the _cached_ranges queue. If the range was cached, the range is removed and
-// done (ranges are either entirely cached or not at all). If the cached read attempt
-// fails, we put the range in state 1.
-//
-// A write range for a context may be in one of two lists:
-// 1) _unstarted_write_ranges : Ranges that have been queued but not processed.
-// 2) _in_flight_ranges: The write range is ready to be processed by the next disk thread
-//    that picks it up in get_next_request_range().
-//
-// AddWriteRange() adds WriteRanges for a disk.
-// It is the responsibility of the client to pin the data to be written via a WriteRange
-// in memory. After a WriteRange has been written, a callback is invoked to inform the
-// client that the write has completed.
-//
-// An important assumption is that write does not exceed the maximum read size and that
-// the entire range is written when the write request is handled. (In other words, writes
-// are not broken up.)
-//
-// When a RequestContext is processed by a disk thread in get_next_request_range(), a write
-// range is always removed from the list of unstarted write ranges and appended to the
-// _in_flight_ranges queue. This is done to alternate reads and writes - a read that is
-// scheduled (by calling GetNextRange()) is always followed by a write (if one exists).
-// And since at most one WriteRange can be present in _in_flight_ranges at any time
-// (once a write range is returned from get_next_request_range() it is completed and not
-// re-enqueued), a scan range scheduled via a call to GetNextRange() can be queued up
-// behind at most one write range.
-class DiskIoMgr::RequestContext {
-public:
-    enum State {
-        // Reader is initialized and maps to a client
-        Active,
-
-        // Reader is in the process of being cancelled.  Cancellation is coordinated between
-        // different threads and when they are all complete, the reader context is moved to
-        // the inactive state.
-        Cancelled,
-
-        // Reader context does not map to a client.  Accessing memory in this context
-        // is invalid (i.e. it is equivalent to a dangling pointer).
-        Inactive,
-    };
-
-    RequestContext(DiskIoMgr* parent, int num_disks);
-
-    // Resets this object.
-    void reset();
-
-    // Decrements the number of active disks for this reader.  If the disk count
-    // goes to 0, the disk complete condition variable is signaled.
-    // Reader lock must be taken before this call.
-    void decrement_disk_ref_count() {
-        // std::mutex doesn't let us DCHECK that the reader lock is taken
-        DCHECK_GT(_num_disks_with_ranges, 0);
-        if (--_num_disks_with_ranges == 0) {
-            _disks_complete_cond_var.notify_one();
-        }
-        DCHECK(validate()) << std::endl << debug_string();
-    }
-
-    // Reader & Disk Scheduling: Readers that currently can't do work are not on
-    // the disk's queue. These readers are ones that don't have any ranges in the
-    // in_flight_queue AND have not prepared a range by setting next_range_to_start.
-    // The rule to make sure readers are scheduled correctly is to ensure anytime a
-    // range is put on the in_flight_queue or anytime next_range_to_start is set to
-    // nullptr, the reader is scheduled.
-
-    // Adds range to in_flight_ranges, scheduling this reader on the disk threads
-    // if necessary.
-    // Reader lock must be taken before this.
-    void schedule_scan_range(DiskIoMgr::ScanRange* range) {
-        DCHECK_EQ(_state, Active);
-        DCHECK(range != nullptr);
-        RequestContext::PerDiskState& state = _disk_states[range->disk_id()];
-        state.in_flight_ranges()->enqueue(range);
-        state.schedule_context(this, range->disk_id());
-    }
-
-    // Cancels the context with status code 'status'.
-    void cancel(const Status& status);
-
-    // Adds request range to disk queue for this request context. Currently,
-    // schedule_immediately must be false if RequestRange is a write range.
-    void add_request_range(DiskIoMgr::RequestRange* range, bool schedule_immediately);
-
-    // Returns the default queue capacity for scan ranges. This is updated
-    // as the reader processes ranges.
-    int initial_scan_range_queue_capacity() const { return _initial_queue_capacity; }
-
-    // Validates invariants of reader.  Reader lock must be taken beforehand.
-    bool validate() const;
-
-    // Dumps out reader information.  Lock should be taken by caller
-    std::string debug_string() const;
-
-private:
-    friend class DiskIoMgr;
-    class PerDiskState;
-
-    // Parent object
-    DiskIoMgr* _parent;
-
-    // Total bytes read for this reader
-    RuntimeProfile::Counter* _bytes_read_counter;
-
-    // Total time spent in hdfs reading
-    RuntimeProfile::Counter* _read_timer;
-
-    // Number of active read threads
-    RuntimeProfile::Counter* _active_read_thread_counter;
-
-    // Disk access bitmap. The counter's bit[i] is set if disk id i has been accessed.
-    // TODO: we can only support up to 64 disks with this bitmap but it lets us use a
-    // builtin atomic instruction. Probably good enough for now.
-    RuntimeProfile::Counter* _disks_accessed_bitmap;
-
-    // Total number of bytes read locally, updated at end of each range scan
-    std::atomic<int64_t> _bytes_read_local {0};
-
-    // Total number of bytes read via short circuit read, updated at end of each range scan
-    std::atomic<int64_t> _bytes_read_short_circuit {0};
-
-    // Total number of bytes read from data node cache, updated at end of each range scan
-    std::atomic<int64_t> _bytes_read_dn_cache {0};
-
-    // Total number of bytes from remote reads that were expected to be local.
-    std::atomic<int64_t> _unexpected_remote_bytes {0};
-
-    // The number of buffers that have been returned to the reader (via get_next) that the
-    // reader has not returned. Only included for debugging and diagnostics.
-    std::atomic<int> _num_buffers_in_reader {0};
-
-    // The number of scan ranges that have been completed for this reader.
-    std::atomic<int> _num_finished_ranges {0};
-
-    // The number of scan ranges that required a remote read, updated at the end of each
-    // range scan. Only used for diagnostics.
-    std::atomic<int> _num_remote_ranges {0};
-
-    // The total number of scan ranges that have not been started. Only used for
-    // diagnostics. This is the sum of all unstarted_scan_ranges across all disks.
-    std::atomic<int> _num_unstarted_scan_ranges {0};
-
-    // The number of buffers that are being used for this reader. This is the sum
-    // of all buffers in ScanRange queues and buffers currently being read into (i.e. about
-    // to be queued).
-    std::atomic<int> _num_used_buffers {0};
-
-    // The total number of ready buffers across all ranges.  Ready buffers are buffers
-    // that have been read from disk but not retrieved by the caller.
-    // This is the sum of all queued buffers in all ranges for this reader context.
-    std::atomic<int> _num_ready_buffers {0};
-
-    // The total (sum) of queue capacities for finished scan ranges. This value
-    // divided by _num_finished_ranges is the average for finished ranges and
-    // used to seed the starting queue capacity for future ranges. The assumption
-    // is that if previous ranges were fast, new ones will be fast too. The scan
-    // range adjusts the queue capacity dynamically so a rough approximation will do.
-    std::atomic<int> _total_range_queue_capacity {0};
-
-    // The initial queue size for new scan ranges. This is always
-    // _total_range_queue_capacity / _num_finished_ranges but stored as a separate
-    // variable to allow reading this value without taking a lock. Doing the division
-    // at read time (with no lock) could lead to a race where only
-    // _total_range_queue_capacity or _num_finished_ranges was updated.
-    int _initial_queue_capacity;
-
-    // All fields below are accessed by multiple threads and the lock needs to be
-    // taken before accessing them.
-    std::mutex _lock;
-
-    // Current state of the reader
-    State _state;
-
-    // Status of this reader.  Set to non-ok if cancelled.
-    Status _status;
-
-    // The number of disks with scan ranges remaining (always equal to the sum of
-    // disks with ranges).
-    int _num_disks_with_ranges;
-
-    // This is the list of ranges that are expected to be cached on the DN.
-    // When the reader asks for a new range (GetNextScanRange()), we first
-    // return ranges from this list.
-    InternalQueue<ScanRange> _cached_ranges;
-
-    // A list of ranges that should be returned in subsequent calls to
-    // GetNextRange.
-    // There is a trade-off with when to populate this list.  Populating it on
-    // demand means consumers need to wait (happens in DiskIoMgr::GetNextRange()).
-    // Populating it preemptively means we make worse scheduling decisions.
-    // We currently populate one range per disk.
-    // TODO: think about this some more.
-    InternalQueue<ScanRange> _ready_to_start_ranges;
-    std::condition_variable _ready_to_start_ranges_cv; // used with _lock
-
-    // Ranges that are blocked due to back pressure on outgoing buffers.
-    InternalQueue<ScanRange> _blocked_ranges;
-
-    // Condition variable for UnregisterContext() to wait for all disks to complete
-    std::condition_variable _disks_complete_cond_var;
-
-    // Struct containing state per disk. See comments in the disk read loop on how
-    // they are used.
-    class PerDiskState {
-    public:
-        bool done() const { return _done; }
-        void set_done(bool b) { _done = b; }
-
-        int num_remaining_ranges() const { return _num_remaining_ranges; }
-        int& num_remaining_ranges() { return _num_remaining_ranges; }
-
-        ScanRange* next_scan_range_to_start() { return _next_scan_range_to_start; }
-        void set_next_scan_range_to_start(ScanRange* range) { _next_scan_range_to_start = range; }
-
-        // We need to have a memory barrier to prevent this load from being reordered
-        // with num_threads_in_op(), since these variables are set without the reader
-        // lock taken
-        bool is_on_queue() const {
-            bool b = _is_on_queue;
-            __sync_synchronize();
-            return b;
-        }
-
-        int num_threads_in_op() const {
-            int v = _num_threads_in_op;
-            __sync_synchronize();
-            return v;
-        }
-
-        const InternalQueue<ScanRange>* unstarted_scan_ranges() const {
-            return &_unstarted_scan_ranges;
-        }
-        const InternalQueue<WriteRange>* unstarted_write_ranges() const {
-            return &_unstarted_write_ranges;
-        }
-        const InternalQueue<RequestRange>* in_flight_ranges() const { return &_in_flight_ranges; }
-
-        InternalQueue<ScanRange>* unstarted_scan_ranges() { return &_unstarted_scan_ranges; }
-        InternalQueue<WriteRange>* unstarted_write_ranges() { return &_unstarted_write_ranges; }
-        InternalQueue<RequestRange>* in_flight_ranges() { return &_in_flight_ranges; }
-
-        PerDiskState() { reset(); }
-
-        // Schedules the request context on this disk if it's not already on the queue.
-        // Context lock must be taken before this.
-        void schedule_context(RequestContext* context, int disk_id) {
-            if (!_is_on_queue && !_done) {
-                _is_on_queue = true;
-                context->_parent->_disk_queues[disk_id]->enqueue_context(context);
-            }
-        }
-
-        // Increment the ref count on reader.  We need to track the number of threads per
-        // reader per disk that are in the unlocked hdfs read code section. This is updated
-        // by multiple threads without a lock so we need to use an atomic int.
-        void increment_request_thread_and_dequeue() {
-            ++_num_threads_in_op;
-            _is_on_queue = false;
-        }
-
-        void decrement_request_thread() { --_num_threads_in_op; }
-
-        // Decrement request thread count and do final cleanup if this is the last
-        // thread. RequestContext lock must be taken before this.
-        void decrement_request_thread_and_check_done(RequestContext* context) {
-            --_num_threads_in_op;
-            // We don't need to worry about reordered loads here because updating
-            // _num_threads_in_op uses an atomic, which is a barrier.
-            if (!_is_on_queue && _num_threads_in_op == 0 && !_done) {
-                // This thread is the last one for this reader on this disk, do final cleanup
-                context->decrement_disk_ref_count();
-                _done = true;
-            }
-        }
-
-        void reset() {
-            DCHECK(_in_flight_ranges.empty());
-            DCHECK(_unstarted_scan_ranges.empty());
-            DCHECK(_unstarted_write_ranges.empty());
-
-            _done = true;
-            _num_remaining_ranges = 0;
-            _is_on_queue = false;
-            _num_threads_in_op = 0;
-            _next_scan_range_to_start = nullptr;
-        }
-
-    private:
-        // If true, this disk is all done for this request context, including any cleanup.
-        // If done is true, it means that this request must not be on this disk's queue
-        // *AND* there are no threads currently working on this context. To satisfy
-        // this, only the last thread (per disk) can set this to true.
-        bool _done;
-
-        // For each disk, keeps track if the context is on this disk's queue, indicating
-        // the disk must do some work for this context. The disk needs to do work in 4 cases:
-        //  1) in_flight_ranges is not empty, the disk needs to read for this reader.
-        //  2) next_range_to_start is nullptr, the disk needs to prepare a scan range to be
-        //     read next.
-        //  3) the reader has been cancelled and this disk needs to participate in the
-        //     cleanup.
-        //  4) A write range is added to queue.
-        // In general, we only want to put a context on the disk queue if there is something
-        // useful that can be done. If there's nothing useful, the disk queue will wake up
-        // and then remove the reader from the queue. Doing this causes thrashing of the
-        // threads.
-        bool _is_on_queue;
-
-        // For each disk, the number of request ranges that have not been fully read.
-        // In the non-cancellation path, this will hit 0, and done will be set to true
-        // by the disk thread. This is undefined in the cancellation path (the various
-        // threads notice by looking at the RequestContext's _state).
-        int _num_remaining_ranges;
-
-        // Queue of ranges that have not started being read.  This list is exclusive
-        // with in_flight_ranges.
-        InternalQueue<ScanRange> _unstarted_scan_ranges;
-
-        // Queue of pending IO requests for this disk in the order that they will be
-        // processed. A ScanRange is added to this queue when it is returned in
-        // GetNextRange(), or when it is added with schedule_immediately = true.
-        // A WriteRange is added to this queue from _unstarted_write_ranges for each
-        // invocation of get_next_request_range() in WorkLoop().
-        // The size of this queue is always less than or equal to num_remaining_ranges.
-        InternalQueue<RequestRange> _in_flight_ranges;
-
-        // The next range to start for this reader on this disk. Each disk (for each reader)
-        // picks the next range to start. The range is set here and also added to the
-        // _ready_to_start_ranges queue. The reader pulls from the queue in FIFO order,
-        // so the ranges from different disks are round-robined. When the range is pulled
-        // off the _ready_to_start_ranges queue, it sets this variable to nullptr, so the disk
-        // knows to populate it again and add it to _ready_to_start_ranges i.e. it is used
-        // as a flag by DiskIoMgr::GetNextScanRange to determine if it needs to add another
-        // range to _ready_to_start_ranges.
-        ScanRange* _next_scan_range_to_start;
-
-        // For each disk, the number of threads issuing the underlying read/write on behalf
-        // of this context. There are a few places where we release the context lock, do some
-        // work, and then grab the lock again.  Because we don't hold the lock for the
-        // entire operation, we need this ref count to keep track of which thread should do
-        // final resource cleanup during cancellation.
-        // Only the thread that sees the count at 0 should do the final cleanup.
-        std::atomic<int> _num_threads_in_op {0};
-
-        // Queue of write ranges to process for this disk. A write range is always added
-        // to _in_flight_ranges in get_next_request_range(). There are separate
-        // _unstarted_scan_ranges and _unstarted_write_ranges queues to alternate between reads
-        // and writes. (Otherwise, since next_scan_range_to_start is set
-        // in get_next_request_range() whenever it is null, repeated calls to
-        // get_next_request_range() and GetNextRange() may result in only reads being processed)
-        InternalQueue<WriteRange> _unstarted_write_ranges;
-    };
-
-    // Per disk states to synchronize multiple disk threads accessing the same request
-    // context.
-    std::vector<PerDiskState> _disk_states;
-};
-
-} // namespace doris
diff --git a/be/src/runtime/disk_io_mgr_reader_context.cc b/be/src/runtime/disk_io_mgr_reader_context.cc
deleted file mode 100644
index 4ef71f1c94..0000000000
--- a/be/src/runtime/disk_io_mgr_reader_context.cc
+++ /dev/null
@@ -1,322 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-// This file is copied from
-// https://github.com/apache/impala/blob/branch-2.9.0/be/src/runtime/disk-io-mgr-reader-context.cc
-// and modified by Doris
-
-#include "runtime/disk_io_mgr_internal.h"
-
-namespace doris {
-
-using std::string;
-using std::stringstream;
-using std::vector;
-using std::list;
-using std::endl;
-
-using std::lock_guard;
-using std::unique_lock;
-using std::mutex;
-
-void DiskIoMgr::RequestContext::cancel(const Status& status) {
-    DCHECK(!status.ok());
-
-    // Callbacks are collected in this vector and invoked while no lock is held.
-    vector<WriteRange::WriteDoneCallback> write_callbacks;
-    {
-        lock_guard<mutex> lock(_lock);
-        DCHECK(validate()) << endl << debug_string();
-
-        // Already being cancelled
-        if (_state == RequestContext::Cancelled) {
-            return;
-        }
-
-        DCHECK(_status.ok());
-        _status = status;
-
-        // The reader will be put into a cancelled state until call cleanup is complete.
-        _state = RequestContext::Cancelled;
-
-        // Cancel all scan ranges for this reader. Each range could be one one of
-        // four queues.
-        for (int i = 0; i < _disk_states.size(); ++i) {
-            RequestContext::PerDiskState& state = _disk_states[i];
-            RequestRange* range = nullptr;
-            while ((range = state.in_flight_ranges()->dequeue()) != nullptr) {
-                if (range->request_type() == RequestType::READ) {
-                    static_cast<ScanRange*>(range)->cancel(status);
-                } else {
-                    DCHECK(range->request_type() == RequestType::WRITE);
-                    write_callbacks.push_back(static_cast<WriteRange*>(range)->_callback);
-                }
-            }
-
-            ScanRange* scan_range = nullptr;
-            while ((scan_range = state.unstarted_scan_ranges()->dequeue()) != nullptr) {
-                scan_range->cancel(status);
-            }
-            WriteRange* write_range = nullptr;
-            while ((write_range = state.unstarted_write_ranges()->dequeue()) != nullptr) {
-                write_callbacks.push_back(write_range->_callback);
-            }
-        }
-
-        ScanRange* range = nullptr;
-        while ((range = _ready_to_start_ranges.dequeue()) != nullptr) {
-            range->cancel(status);
-        }
-        while ((range = _blocked_ranges.dequeue()) != nullptr) {
-            range->cancel(status);
-        }
-        while ((range = _cached_ranges.dequeue()) != nullptr) {
-            range->cancel(status);
-        }
-
-        // Schedule reader on all disks. The disks will notice it is cancelled and do any
-        // required cleanup
-        for (int i = 0; i < _disk_states.size(); ++i) {
-            RequestContext::PerDiskState& state = _disk_states[i];
-            state.schedule_context(this, i);
-        }
-    }
-
-    for (const WriteRange::WriteDoneCallback& write_callback : write_callbacks) {
-        write_callback(_status);
-    }
-
-    // Signal reader and unblock the get_next/Read thread.  That read will fail with
-    // a cancelled status.
-    _ready_to_start_ranges_cv.notify_all();
-}
-
-void DiskIoMgr::RequestContext::add_request_range(DiskIoMgr::RequestRange* range,
-                                                  bool schedule_immediately) {
-    // DCHECK(_lock.is_locked()); // TODO: boost should have this API
-    RequestContext::PerDiskState& state = _disk_states[range->disk_id()];
-    if (state.done()) {
-        DCHECK_EQ(state.num_remaining_ranges(), 0);
-        state.set_done(false);
-        ++_num_disks_with_ranges;
-    }
-
-    bool schedule_context = false;
-    if (range->request_type() == RequestType::READ) {
-        DiskIoMgr::ScanRange* scan_range = static_cast<DiskIoMgr::ScanRange*>(range);
-        if (schedule_immediately) {
-            schedule_scan_range(scan_range);
-        } else {
-            state.unstarted_scan_ranges()->enqueue(scan_range);
-            ++_num_unstarted_scan_ranges;
-        }
-        // If next_scan_range_to_start is nullptr, schedule this RequestContext so that it will
-        // be set. If it's not nullptr, this context will be scheduled when GetNextRange() is
-        // invoked.
-        schedule_context = state.next_scan_range_to_start() == nullptr;
-    } else {
-        DCHECK(range->request_type() == RequestType::WRITE);
-        DCHECK(!schedule_immediately);
-        DiskIoMgr::WriteRange* write_range = static_cast<DiskIoMgr::WriteRange*>(range);
-        state.unstarted_write_ranges()->enqueue(write_range);
-
-        // schedule_context() has no effect if the context is already scheduled,
-        // so this is safe.
-        schedule_context = true;
-    }
-
-    if (schedule_context) {
-        state.schedule_context(this, range->disk_id());
-    }
-    ++state.num_remaining_ranges();
-}
-
-DiskIoMgr::RequestContext::RequestContext(DiskIoMgr* parent, int num_disks)
-        : _parent(parent),
-          _bytes_read_counter(nullptr),
-          _read_timer(nullptr),
-          _active_read_thread_counter(nullptr),
-          _disks_accessed_bitmap(nullptr),
-          _state(Inactive),
-          _disk_states(num_disks) {}
-
-// Resets this object.
-void DiskIoMgr::RequestContext::reset() {
-    DCHECK_EQ(_state, Inactive);
-    _status = Status::OK();
-
-    _bytes_read_counter = nullptr;
-    _read_timer = nullptr;
-    _active_read_thread_counter = nullptr;
-    _disks_accessed_bitmap = nullptr;
-
-    _state = Active;
-
-    _num_unstarted_scan_ranges = 0;
-    _num_disks_with_ranges = 0;
-    _num_used_buffers = 0;
-    _num_buffers_in_reader = 0;
-    _num_ready_buffers = 0;
-    _total_range_queue_capacity = 0;
-    _num_finished_ranges = 0;
-    _num_remote_ranges = 0;
-    _bytes_read_local = 0;
-    _bytes_read_short_circuit = 0;
-    _bytes_read_dn_cache = 0;
-    _unexpected_remote_bytes = 0;
-    _initial_queue_capacity = DiskIoMgr::DEFAULT_QUEUE_CAPACITY;
-
-    DCHECK(_ready_to_start_ranges.empty());
-    DCHECK(_blocked_ranges.empty());
-    DCHECK(_cached_ranges.empty());
-
-    for (int i = 0; i < _disk_states.size(); ++i) {
-        _disk_states[i].reset();
-    }
-}
-
-// Dumps out request context information. Lock should be taken by caller
-string DiskIoMgr::RequestContext::debug_string() const {
-    stringstream ss;
-    ss << endl << "  RequestContext: " << (void*)this << " (state=";
-    if (_state == RequestContext::Inactive) {
-        ss << "Inactive";
-    }
-    if (_state == RequestContext::Cancelled) ss << "Cancelled";
-    if (_state == RequestContext::Active) ss << "Active";
-    if (_state != RequestContext::Inactive) {
-        ss << " _status=" << _status << " #ready_buffers=" << _num_ready_buffers
-           << " #used_buffers=" << _num_used_buffers
-           << " #num_buffers_in_reader=" << _num_buffers_in_reader
-           << " #finished_scan_ranges=" << _num_finished_ranges
-           << " #disk_with_ranges=" << _num_disks_with_ranges
-           << " #disks=" << _num_disks_with_ranges;
-        for (int i = 0; i < _disk_states.size(); ++i) {
-            ss << endl
-               << "   " << i << ": "
-               << "is_on_queue=" << _disk_states[i].is_on_queue()
-               << " done=" << _disk_states[i].done()
-               << " #num_remaining_scan_ranges=" << _disk_states[i].num_remaining_ranges()
-               << " #in_flight_ranges=" << _disk_states[i].in_flight_ranges()->size()
-               << " #unstarted_scan_ranges=" << _disk_states[i].unstarted_scan_ranges()->size()
-               << " #unstarted_write_ranges=" << _disk_states[i].unstarted_write_ranges()->size()
-               << " #reading_threads=" << _disk_states[i].num_threads_in_op();
-        }
-    }
-    ss << ")";
-    return ss.str();
-}
-
-bool DiskIoMgr::RequestContext::validate() const {
-    if (_state == RequestContext::Inactive) {
-        LOG(WARNING) << "_state == RequestContext::Inactive";
-        return false;
-    }
-
-    if (_num_used_buffers < 0) {
-        LOG(WARNING) << "_num_used_buffers < 0: #used=" << _num_used_buffers;
-        return false;
-    }
-
-    if (_num_ready_buffers < 0) {
-        LOG(WARNING) << "_num_ready_buffers < 0: #used=" << _num_ready_buffers;
-        return false;
-    }
-
-    int total_unstarted_ranges = 0;
-    for (int i = 0; i < _disk_states.size(); ++i) {
-        const PerDiskState& state = _disk_states[i];
-        bool on_queue = state.is_on_queue();
-        int num_reading_threads = state.num_threads_in_op();
-
-        total_unstarted_ranges += state.unstarted_scan_ranges()->size();
-
-        if (num_reading_threads < 0) {
-            LOG(WARNING) << "disk_id=" << i
-                         << "state.num_threads_in_read < 0: #threads=" << num_reading_threads;
-            return false;
-        }
-
-        if (_state != RequestContext::Cancelled) {
-            if (state.unstarted_scan_ranges()->size() + state.in_flight_ranges()->size() >
-                state.num_remaining_ranges()) {
-                LOG(WARNING) << "disk_id=" << i
-                             << " state.unstarted_ranges.size() + state.in_flight_ranges.size()"
-                             << " > state.num_remaining_ranges:"
-                             << " #unscheduled=" << state.unstarted_scan_ranges()->size()
-                             << " #in_flight=" << state.in_flight_ranges()->size()
-                             << " #remaining=" << state.num_remaining_ranges();
-                return false;
-            }
-
-            // If we have an in_flight range, the reader must be on the queue or have a
-            // thread actively reading for it.
-            if (!state.in_flight_ranges()->empty() && !on_queue && num_reading_threads == 0) {
-                LOG(WARNING) << "disk_id=" << i
-                             << " reader has inflight ranges but is not on the disk queue."
-                             << " #in_flight_ranges=" << state.in_flight_ranges()->size()
-                             << " #reading_threads=" << num_reading_threads
-                             << " on_queue=" << on_queue;
-                return false;
-            }
-
-            if (state.done() && num_reading_threads > 0) {
-                LOG(WARNING) << "disk_id=" << i
-                             << " state set to done but there are still threads working."
-                             << " #reading_threads=" << num_reading_threads;
-                return false;
-            }
-        } else {
-            // Is Cancelled
-            if (!state.in_flight_ranges()->empty()) {
-                LOG(WARNING) << "disk_id=" << i << "Reader cancelled but has in flight ranges.";
-                return false;
-            }
-            if (!state.unstarted_scan_ranges()->empty()) {
-                LOG(WARNING) << "disk_id=" << i << "Reader cancelled but has unstarted ranges.";
-                return false;
-            }
-        }
-
-        if (state.done() && on_queue) {
-            LOG(WARNING) << "disk_id=" << i
-                         << " state set to done but the reader is still on the disk queue."
-                         << " state.done=true and state.is_on_queue=true";
-            return false;
-        }
-    }
-
-    if (_state != RequestContext::Cancelled) {
-        if (total_unstarted_ranges != _num_unstarted_scan_ranges) {
-            LOG(WARNING) << "total_unstarted_ranges=" << total_unstarted_ranges
-                         << " sum_in_states=" << _num_unstarted_scan_ranges;
-            return false;
-        }
-    } else {
-        if (!_ready_to_start_ranges.empty()) {
-            LOG(WARNING) << "Reader cancelled but has ready to start ranges.";
-            return false;
-        }
-        if (!_blocked_ranges.empty()) {
-            LOG(WARNING) << "Reader cancelled but has blocked ranges.";
-            return false;
-        }
-    }
-
-    return true;
-}
-
-} // namespace doris
diff --git a/be/src/runtime/disk_io_mgr_scan_range.cc b/be/src/runtime/disk_io_mgr_scan_range.cc
deleted file mode 100644
index 82962f4b3d..0000000000
--- a/be/src/runtime/disk_io_mgr_scan_range.cc
+++ /dev/null
@@ -1,481 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-// This file is copied from
-// https://github.com/apache/impala/blob/branch-2.9.0/be/src/runtime/disk-io-mgr-scan-range.cc
-// and modified by Doris
-
-#include "runtime/disk_io_mgr.h"
-#include "runtime/disk_io_mgr_internal.h"
-#include "util/error_util.h"
-
-using std::string;
-using std::stringstream;
-using std::vector;
-using std::list;
-using std::endl;
-
-using std::lock_guard;
-using std::unique_lock;
-using std::mutex;
-
-namespace doris {
-
-// A very large max value to prevent things from going out of control. Not
-// expected to ever hit this value (1GB of buffered data per range).
-const int MAX_QUEUE_CAPACITY = 128;
-const int MIN_QUEUE_CAPACITY = 2;
-
-// Implementation of the ScanRange functionality. Each ScanRange contains a queue
-// of ready buffers. For each ScanRange, there is only a single producer and
-// consumer thread, i.e. only one disk thread will push to a scan range at
-// any time and only one thread will remove from the queue. This is to guarantee
-// that buffers are queued and read in file order.
-
-// This must be called with the reader lock taken.
-bool DiskIoMgr::ScanRange::enqueue_buffer(BufferDescriptor* buffer) {
-    {
-        unique_lock<mutex> scan_range_lock(_lock);
-        DCHECK(validate()) << debug_string();
-        DCHECK(!_eosr_returned);
-        DCHECK(!_eosr_queued);
-        if (_is_cancelled) {
-            // Return the buffer, this range has been cancelled
-            if (buffer->_buffer != nullptr) {
-                ++_io_mgr->_num_buffers_in_readers;
-                ++_reader->_num_buffers_in_reader;
-            }
-            --_reader->_num_used_buffers;
-            buffer->return_buffer();
-            return false;
-        }
-        ++_reader->_num_ready_buffers;
-        _ready_buffers.push_back(buffer);
-        _eosr_queued = buffer->eosr();
-
-        _blocked_on_queue = _ready_buffers.size() >= _ready_buffers_capacity;
-        if (_blocked_on_queue && _ready_buffers_capacity > MIN_QUEUE_CAPACITY) {
-            // We have filled the queue, indicating we need back pressure on
-            // the producer side (i.e. we are pushing buffers faster than they
-            // are pulled off, throttle this range more).
-            --_ready_buffers_capacity;
-        }
-    }
-
-    _buffer_ready_cv.notify_one();
-
-    return _blocked_on_queue;
-}
-
-Status DiskIoMgr::ScanRange::get_next(BufferDescriptor** buffer) {
-    *buffer = nullptr;
-
-    {
-        unique_lock<mutex> scan_range_lock(_lock);
-        if (_eosr_returned) {
-            return Status::OK();
-        }
-        DCHECK(validate()) << debug_string();
-
-        if (_ready_buffers.empty()) {
-            // The queue is empty indicating this thread could use more
-            // IO. Increase the capacity to allow for more queueing.
-            ++_ready_buffers_capacity;
-            _ready_buffers_capacity = std::min(_ready_buffers_capacity, MAX_QUEUE_CAPACITY);
-        }
-
-        while (_ready_buffers.empty() && !_is_cancelled) {
-            _buffer_ready_cv.wait(scan_range_lock);
-        }
-
-        if (_is_cancelled) {
-            DCHECK(!_status.ok());
-            return _status;
-        }
-
-        // Remove the first ready buffer from the queue and return it
-        DCHECK(!_ready_buffers.empty());
-        *buffer = _ready_buffers.front();
-        _ready_buffers.pop_front();
-        _eosr_returned = (*buffer)->eosr();
-    }
-
-    // Update tracking counters. The buffer has now moved from the IoMgr to the
-    // caller.
-    ++_io_mgr->_num_buffers_in_readers;
-    ++_reader->_num_buffers_in_reader;
-    --_reader->_num_ready_buffers;
-    --_reader->_num_used_buffers;
-
-    Status status = (*buffer)->_status;
-    if (!status.ok()) {
-        (*buffer)->return_buffer();
-        *buffer = nullptr;
-        return status;
-    }
-
-    unique_lock<mutex> reader_lock(_reader->_lock);
-    if (_eosr_returned) {
-        _reader->_total_range_queue_capacity += _ready_buffers_capacity;
-        ++_reader->_num_finished_ranges;
-        _reader->_initial_queue_capacity =
-                _reader->_total_range_queue_capacity / _reader->_num_finished_ranges;
-    }
-
-    DCHECK(_reader->validate()) << endl << _reader->debug_string();
-    if (_reader->_state == RequestContext::Cancelled) {
-        _reader->_blocked_ranges.remove(this);
-        cancel(_reader->_status);
-        (*buffer)->return_buffer();
-        *buffer = nullptr;
-        return _status;
-    }
-
-    bool was_blocked = _blocked_on_queue;
-    _blocked_on_queue = _ready_buffers.size() >= _ready_buffers_capacity;
-    if (was_blocked && !_blocked_on_queue && !_eosr_queued) {
-        // This scan range was blocked and is no longer, add it to the reader
-        // queue again.
-        _reader->_blocked_ranges.remove(this);
-        _reader->schedule_scan_range(this);
-    }
-    return Status::OK();
-}
-
-void DiskIoMgr::ScanRange::cancel(const Status& status) {
-    // Cancelling a range that was never started, ignore.
-    if (_io_mgr == nullptr) {
-        return;
-    }
-
-    DCHECK(!status.ok());
-    {
-        // Grab both locks to make sure that all working threads see _is_cancelled.
-        unique_lock<mutex> scan_range_lock(_lock);
-        unique_lock<mutex> hdfs_lock(_hdfs_lock);
-        DCHECK(validate()) << debug_string();
-        if (_is_cancelled) {
-            return;
-        }
-        _is_cancelled = true;
-        _status = status;
-    }
-    _buffer_ready_cv.notify_all();
-    cleanup_queued_buffers();
-
-    // For cached buffers, we can't close the range until the cached buffer is returned.
-    // close() is called from DiskIoMgr::return_buffer().
-    if (_cached_buffer == nullptr) {
-        close();
-    }
-}
-
-void DiskIoMgr::ScanRange::cleanup_queued_buffers() {
-    DCHECK(_is_cancelled);
-    _io_mgr->_num_buffers_in_readers += _ready_buffers.size();
-    _reader->_num_buffers_in_reader += _ready_buffers.size();
-    _reader->_num_used_buffers -= _ready_buffers.size();
-    _reader->_num_ready_buffers -= _ready_buffers.size();
-
-    while (!_ready_buffers.empty()) {
-        BufferDescriptor* buffer = _ready_buffers.front();
-        buffer->return_buffer();
-        _ready_buffers.pop_front();
-    }
-}
-
-string DiskIoMgr::ScanRange::debug_string() const {
-    stringstream ss;
-    ss << "file=" << _file << " disk_id=" << _disk_id << " offset=" << _offset << " len=" << _len
-       << " bytes_read=" << _bytes_read << " buffer_queue=" << _ready_buffers.size()
-       << " capacity=" << _ready_buffers_capacity << " hdfs_file=" << _hdfs_file;
-    return ss.str();
-}
-
-bool DiskIoMgr::ScanRange::validate() {
-    if (_bytes_read > _len) {
-        LOG(WARNING) << "Bytes read tracking is wrong. Shouldn't read past the scan range."
-                     << " _bytes_read=" << _bytes_read << " _len=" << _len;
-        return false;
-    }
-    if (_eosr_returned && !_eosr_queued) {
-        LOG(WARNING) << "Returned eosr to reader before finishing reading the scan range"
-                     << " _eosr_returned=" << _eosr_returned << " _eosr_queued=" << _eosr_queued;
-        return false;
-    }
-    return true;
-}
-
-DiskIoMgr::ScanRange::ScanRange(int capacity) : _ready_buffers_capacity(capacity) {
-    _request_type = RequestType::READ;
-    reset(nullptr, "", -1, -1, -1, false, false, NEVER_CACHE);
-}
-
-DiskIoMgr::ScanRange::~ScanRange() {
-    DCHECK(_hdfs_file == nullptr) << "File was not closed.";
-    DCHECK(_cached_buffer == nullptr) << "Cached buffer was not released.";
-}
-
-void DiskIoMgr::ScanRange::reset(hdfsFS fs, const char* file, int64_t len, int64_t offset,
-                                 int disk_id, bool try_cache, bool expected_local, int64_t mtime,
-                                 void* meta_data) {
-    DCHECK(_ready_buffers.empty());
-    _fs = fs;
-    _file = file;
-    _len = len;
-    _offset = offset;
-    _disk_id = disk_id;
-    _try_cache = try_cache;
-    _expected_local = expected_local;
-    _meta_data = meta_data;
-    _cached_buffer = nullptr;
-    _io_mgr = nullptr;
-    _reader = nullptr;
-    _hdfs_file = nullptr;
-    _mtime = mtime;
-}
-
-void DiskIoMgr::ScanRange::init_internal(DiskIoMgr* io_mgr, RequestContext* reader) {
-    DCHECK(_hdfs_file == nullptr);
-    _io_mgr = io_mgr;
-    _reader = reader;
-    _local_file = nullptr;
-    _hdfs_file = nullptr;
-    _bytes_read = 0;
-    _is_cancelled = false;
-    _eosr_queued = false;
-    _eosr_returned = false;
-    _blocked_on_queue = false;
-    if (_ready_buffers_capacity <= 0) {
-        _ready_buffers_capacity = reader->initial_scan_range_queue_capacity();
-        DCHECK_GE(_ready_buffers_capacity, MIN_QUEUE_CAPACITY);
-    }
-    DCHECK(validate()) << debug_string();
-}
-
-Status DiskIoMgr::ScanRange::open() {
-    unique_lock<mutex> hdfs_lock(_hdfs_lock);
-    if (_is_cancelled) {
-        return Status::Cancelled("Cancelled");
-    }
-
-    // if (_fs != nullptr) {
-    //     if (_hdfs_file != nullptr) {
-    //         return Status::OK();
-    //     }
-    //     _hdfs_file = _io_mgr->OpenHdfsFile(_fs, file(), mtime());
-    //     if (_hdfs_file == nullptr) {
-    //         return Status::InternalError("GetHdfsErrorMsg("Failed to open HDFS file ", _file));
-    //     }
-
-    //     if (hdfsSeek(_fs, _hdfs_file->file(), _offset) != 0) {
-    //         _io_mgr->cache_or_close_file_handle(file(), _hdfs_file, false);
-    //         _hdfs_file = nullptr;
-    //         string error_msg = GetHdfsErrorMsg("");
-    //         stringstream ss;
-    //         ss << "Error seeking to " << _offset << " in file: " << _file << " " << error_msg;
-    //         return Status::InternalError(ss.str());
-    //     }
-    // } else {
-    if (_local_file != nullptr) {
-        return Status::OK();
-    }
-
-    _local_file = fopen(file(), "r");
-    if (_local_file == nullptr) {
-        string error_msg = get_str_err_msg();
-        return Status::InternalError("Could not open file: {}: {}", _file, error_msg);
-    }
-    if (fseek(_local_file, _offset, SEEK_SET) == -1) {
-        fclose(_local_file);
-        _local_file = nullptr;
-        string error_msg = get_str_err_msg();
-        return Status::InternalError("Could not seek to {} for file: {}: {}", _offset, _file,
-                                     error_msg);
-    }
-    // }
-    return Status::OK();
-}
-
-void DiskIoMgr::ScanRange::close() {
-    unique_lock<mutex> hdfs_lock(_hdfs_lock);
-    /*
- *   if (_fs != nullptr) {
- *     if (_hdfs_file == nullptr) return;
- *
- *     struct hdfsReadStatistics* stats;
- *     if (IsDfsPath(file())) {
- *       int success = hdfsFileGetReadStatistics(_hdfs_file->file(), &stats);
- *       if (success == 0) {
- *         _reader->_bytes_read_local += stats->totalLocalBytesRead;
- *         _reader->_bytes_read_short_circuit += stats->totalShortCircuitBytesRead;
- *         _reader->_bytes_read_dn_cache += stats->totalZeroCopyBytesRead;
- *         if (stats->totalLocalBytesRead != stats->totalBytesRead) {
- *           ++_reader->_num_remote_ranges;
- *           if (_expected_local) {
- *             int remote_bytes = stats->totalBytesRead - stats->totalLocalBytesRead;
- *             _reader->_unexpected_remote_bytes += remote_bytes;
- *             VLOG_FILE << "Unexpected remote HDFS read of "
- *                       << PrettyPrinter::Print(remote_bytes, TUnit::BYTES)
- *                       << " for file '" << _file << "'";
- *           }
- *         }
- *         hdfsFileFreeReadStatistics(stats);
- *       }
- *     }
- *     if (_cached_buffer != nullptr) {
- *       hadoopRzBufferFree(_hdfs_file->file(), _cached_buffer);
- *       _cached_buffer = nullptr;
- *     }
- *     _io_mgr->cache_or_close_file_handle(file(), _hdfs_file, false);
- *     VLOG_FILE << "Cache HDFS file handle file=" << file();
- *     _hdfs_file = nullptr;
- *   } else {
- */
-    {
-        if (_local_file == nullptr) {
-            return;
-        }
-        fclose(_local_file);
-        _local_file = nullptr;
-    }
-}
-
-/*
- * int64_t DiskIoMgr::ScanRange::max_read_chunk_size() const {
- *     // S3 InputStreams don't support DIRECT_READ (i.e. java.nio.ByteBuffer read()
- *     // interface).  So, hdfsRead() needs to allocate a Java byte[] and copy the data out.
- *     // Profiles show that both the JNI array allocation and the memcpy adds much more
- *     // overhead for larger buffers, so limit the size of each read request.  128K was
- *     // chosen empirically by trying values between 4K and 8M and optimizing for lower CPU
- *     // utilization and higher S3 throughput.
- *     if (_disk_id == _io_mgr->RemoteS3DiskId()) {
- *         DCHECK(IsS3APath(file()));
- *         return 128 * 1024;
- *     }
- *     return numeric_limits<int64_t>::max();
- * }
- */
-
-// TODO: how do we best use the disk here.  e.g. is it good to break up a
-// 1MB read into 8 128K reads?
-// TODO: look at linux disk scheduling
-Status DiskIoMgr::ScanRange::read(char* buffer, int64_t* bytes_read, bool* eosr) {
-    unique_lock<mutex> hdfs_lock(_hdfs_lock);
-    if (_is_cancelled) {
-        return Status::Cancelled("Cancelled");
-    }
-
-    *eosr = false;
-    *bytes_read = 0;
-    // hdfsRead() length argument is an int.  Since _max_buffer_size type is no bigger
-    // than an int, this min() will ensure that we don't overflow the length argument.
-    DCHECK_LE(sizeof(_io_mgr->_max_buffer_size), sizeof(int));
-    int bytes_to_read =
-            std::min(static_cast<int64_t>(_io_mgr->_max_buffer_size), _len - _bytes_read);
-    DCHECK_GE(bytes_to_read, 0);
-
-    /*
-     * if (_fs != nullptr) {
-     *     DCHECK(_hdfs_file != nullptr);
-     *     int64_t max_chunk_size = max_read_chunk_size();
-     *     while (*bytes_read < bytes_to_read) {
-     *         int chunk_size = min(bytes_to_read - *bytes_read, max_chunk_size);
-     *         int last_read = hdfsRead(_fs, _hdfs_file->file(), buffer + *bytes_read, chunk_size);
-     *         if (last_read == -1) {
-     *             return Status::InternalError("GetHdfsErrorMsg("Error reading from HDFS file: ", _file));
-     *         } else if (last_read == 0) {
-     *             // No more bytes in the file. The scan range went past the end.
-     *             *eosr = true;
-     *             break;
-     *         }
-     *         *bytes_read += last_read;
-     *     }
-     * } else {
-     */
-    DCHECK(_local_file != nullptr);
-    *bytes_read = fread(buffer, 1, bytes_to_read, _local_file);
-    DCHECK_GE(*bytes_read, 0);
-    DCHECK_LE(*bytes_read, bytes_to_read);
-    if (*bytes_read < bytes_to_read) {
-        if (ferror(_local_file) != 0) {
-            string error_msg = get_str_err_msg();
-            return Status::InternalError("Error reading from {} at byte offset: {}: {}", _file,
-                                         (_offset + _bytes_read), error_msg);
-        } else {
-            // On Linux, we should only get partial reads from block devices on error or eof.
-            DCHECK(feof(_local_file) != 0);
-            *eosr = true;
-        }
-    }
-    // }
-    _bytes_read += *bytes_read;
-    DCHECK_LE(_bytes_read, _len);
-    if (_bytes_read == _len) {
-        *eosr = true;
-    }
-    return Status::OK();
-}
-
-/*
- * Status DiskIoMgr::ScanRange::read_from_cache(bool* read_succeeded) {
- *   DCHECK(_try_cache);
- *   DCHECK_EQ(_bytes_read, 0);
- *   *read_succeeded = false;
- *   Status status = open();
- *   if (!status.ok()) return status;
- *
- *   // Cached reads not supported on local filesystem.
- *   if (_fs == nullptr) return Status::OK();
- *
- *   {
- *     unique_lock<mutex> hdfs_lock(_hdfs_lock);
- *     if (_is_cancelled) return Status::Cancelled("Cancelled");
- *
- *     DCHECK(_hdfs_file != nullptr);
- *     DCHECK(_cached_buffer == nullptr);
- *     _cached_buffer = hadoopReadZero(_hdfs_file->file(),
- *         _io_mgr->_cached_read_options, len());
- *
- *     // Data was not cached, caller will fall back to normal read path.
- *     if (_cached_buffer == nullptr) return Status::OK();
- *   }
- *
- *   // Cached read succeeded.
- *   void* buffer = const_cast<void*>(hadoopRzBufferGet(_cached_buffer));
- *   int32_t bytes_read = hadoopRzBufferLength(_cached_buffer);
- *   // For now, entire the entire block is cached or none of it.
- *   // TODO: if HDFS ever changes this, we'll have to handle the case where half
- *   // the block is cached.
- *   DCHECK_EQ(bytes_read, len());
- *
- *   // Create a single buffer desc for the entire scan range and enqueue that.
- *   BufferDescriptor* desc = _io_mgr->get_buffer_desc(
- *       _reader, this, reinterpret_cast<char*>(buffer), 0);
- *   desc->_len = bytes_read;
- *   desc->_scan_range_offset = 0;
- *   desc->_eosr = true;
- *   _bytes_read = bytes_read;
- *   enqueue_buffer(desc);
- *   if (_reader->_bytes_read_counter != nullptr) {
- *     COUNTER_ADD(_reader->_bytes_read_counter, bytes_read);
- *   }
- *   *read_succeeded = true;
- *   ++_reader->_num_used_buffers;
- *   return Status::OK();
- * }
- */
-} // namespace doris
diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h
index a4ebf52be6..c47d15d9fa 100644
--- a/be/src/runtime/exec_env.h
+++ b/be/src/runtime/exec_env.h
@@ -40,11 +40,8 @@ class BrokerMgr;
 template <class T>
 class BrpcClientCache;
 
-class BufferPool;
 class CgroupsMgr;
 class DataStreamMgr;
-class DiskIoMgr;
-class EtlJobMgr;
 class EvHttpServer;
 class ExternalScanContextMgr;
 class FragmentMgr;
@@ -153,7 +150,6 @@ public:
     ResultCache* result_cache() { return _result_cache; }
     TMasterInfo* master_info() { return _master_info; }
     LoadPathMgr* load_path_mgr() { return _load_path_mgr; }
-    DiskIoMgr* disk_io_mgr() { return _disk_io_mgr; }
     TmpFileMgr* tmp_file_mgr() { return _tmp_file_mgr; }
     BfdParser* bfd_parser() const { return _bfd_parser; }
     BrokerMgr* broker_mgr() const { return _broker_mgr; }
@@ -163,7 +159,6 @@ public:
     BrpcClientCache<PFunctionService_Stub>* brpc_function_client_cache() const {
         return _function_client_cache;
     }
-    BufferPool* buffer_pool() { return _buffer_pool; }
     LoadChannelMgr* load_channel_mgr() { return _load_channel_mgr; }
     LoadStreamMgr* load_stream_mgr() { return _load_stream_mgr; }
     NewLoadStreamMgr* new_load_stream_mgr() { return _new_load_stream_mgr; }
@@ -195,8 +190,6 @@ private:
     void _destroy();
 
     Status _init_mem_env();
-    /// Initialise 'buffer_pool_' with given capacity.
-    void _init_buffer_pool(int64_t min_page_len, int64_t capacity, int64_t clean_pages_limit);
 
     void _register_metrics();
     void _deregister_metrics();
@@ -237,7 +230,6 @@ private:
     ResultCache* _result_cache = nullptr;
     TMasterInfo* _master_info = nullptr;
     LoadPathMgr* _load_path_mgr = nullptr;
-    DiskIoMgr* _disk_io_mgr = nullptr;
     TmpFileMgr* _tmp_file_mgr = nullptr;
 
     BfdParser* _bfd_parser = nullptr;
@@ -248,8 +240,6 @@ private:
     BrpcClientCache<PBackendService_Stub>* _internal_client_cache = nullptr;
     BrpcClientCache<PFunctionService_Stub>* _function_client_cache = nullptr;
 
-    BufferPool* _buffer_pool = nullptr;
-
     StorageEngine* _storage_engine = nullptr;
 
     StreamLoadExecutor* _stream_load_executor = nullptr;
diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp
index fcc886bb25..22fa7f9123 100644
--- a/be/src/runtime/exec_env_init.cpp
+++ b/be/src/runtime/exec_env_init.cpp
@@ -28,10 +28,8 @@
 #include "pipeline/task_scheduler.h"
 #include "runtime/block_spill_manager.h"
 #include "runtime/broker_mgr.h"
-#include "runtime/bufferpool/buffer_pool.h"
 #include "runtime/cache/result_cache.h"
 #include "runtime/client_cache.h"
-#include "runtime/disk_io_mgr.h"
 #include "runtime/exec_env.h"
 #include "runtime/external_scan_context_mgr.h"
 #include "runtime/fold_constant_executor.h"
@@ -115,7 +113,6 @@ Status ExecEnv::_init(const std::vector<StorePath>& store_paths) {
                                     config::query_cache_elasticity_size_mb);
     _master_info = new TMasterInfo();
     _load_path_mgr = new LoadPathMgr(this);
-    _disk_io_mgr = new DiskIoMgr();
     _tmp_file_mgr = new TmpFileMgr(this);
     _bfd_parser = BfdParser::create();
     _broker_mgr = new BrokerMgr(this);
@@ -186,48 +183,11 @@ Status ExecEnv::_init_mem_env() {
         return Status::InternalError(ss.str());
     }
 
-    int64_t buffer_pool_limit = ParseUtil::parse_mem_spec(
-            config::buffer_pool_limit, MemInfo::mem_limit(), MemInfo::physical_mem(), &is_percent);
-    if (buffer_pool_limit <= 0) {
-        ss << "Invalid config buffer_pool_limit value, must be a percentage or "
-              "positive bytes value or percentage: "
-           << config::buffer_pool_limit;
-        return Status::InternalError(ss.str());
-    }
-    buffer_pool_limit = BitUtil::RoundDown(buffer_pool_limit, config::min_buffer_size);
-    while (!is_percent && buffer_pool_limit > MemInfo::mem_limit() / 2) {
-        // If buffer_pool_limit is not a percentage, and the value exceeds 50% of the total memory limit,
-        // it is forced to be reduced to less than 50% of the total memory limit.
-        // This is to ensure compatibility. In principle, buffer_pool_limit should be set as a percentage.
-        buffer_pool_limit = buffer_pool_limit / 2;
-    }
-
-    int64_t clean_pages_limit =
-            ParseUtil::parse_mem_spec(config::buffer_pool_clean_pages_limit, buffer_pool_limit,
-                                      MemInfo::physical_mem(), &is_percent);
-    if (clean_pages_limit <= 0) {
-        ss << "Invalid buffer_pool_clean_pages_limit value, must be a percentage or "
-              "positive bytes value or percentage: "
-           << config::buffer_pool_clean_pages_limit;
-        return Status::InternalError(ss.str());
-    }
-    while (!is_percent && clean_pages_limit > buffer_pool_limit / 2) {
-        // Reason same as buffer_pool_limit
-        clean_pages_limit = clean_pages_limit / 2;
-    }
-    _init_buffer_pool(config::min_buffer_size, buffer_pool_limit, clean_pages_limit);
-    LOG(INFO) << "Buffer pool memory limit: "
-              << PrettyPrinter::print(buffer_pool_limit, TUnit::BYTES)
-              << ", origin config value: " << config::buffer_pool_limit
-              << ". clean pages limit: " << PrettyPrinter::print(clean_pages_limit, TUnit::BYTES)
-              << ", origin config value: " << config::buffer_pool_clean_pages_limit;
-
     // 3. init storage page cache
     int64_t storage_cache_limit =
             ParseUtil::parse_mem_spec(config::storage_page_cache_limit, MemInfo::mem_limit(),
                                       MemInfo::physical_mem(), &is_percent);
     while (!is_percent && storage_cache_limit > MemInfo::mem_limit() / 2) {
-        // Reason same as buffer_pool_limit
         storage_cache_limit = storage_cache_limit / 2;
     }
     int32_t index_percentage = config::index_page_cache_percentage;
@@ -254,7 +214,6 @@ Status ExecEnv::_init_mem_env() {
     SegmentLoader::create_global_instance(segment_cache_capacity);
 
     // 4. init other managers
-    RETURN_IF_ERROR(_disk_io_mgr->init(MemInfo::mem_limit()));
     RETURN_IF_ERROR(_tmp_file_mgr->init());
     RETURN_IF_ERROR(_block_spill_mgr->init());
 
@@ -277,12 +236,6 @@ Status ExecEnv::_init_mem_env() {
     return Status::OK();
 }
 
-void ExecEnv::_init_buffer_pool(int64_t min_page_size, int64_t capacity,
-                                int64_t clean_pages_limit) {
-    DCHECK(_buffer_pool == nullptr);
-    _buffer_pool = new BufferPool(min_page_size, capacity, clean_pages_limit);
-}
-
 void ExecEnv::init_download_cache_buf() {
     std::unique_ptr<char[]> download_cache_buf(new char[config::download_cache_buffer_size]);
     memset(download_cache_buf.get(), 0, config::download_cache_buffer_size);
@@ -335,7 +288,6 @@ void ExecEnv::_destroy() {
     SAFE_DELETE(_broker_mgr);
     SAFE_DELETE(_bfd_parser);
     SAFE_DELETE(_tmp_file_mgr);
-    SAFE_DELETE(_disk_io_mgr);
     SAFE_DELETE(_load_path_mgr);
     SAFE_DELETE(_master_info);
     SAFE_DELETE(_fragment_mgr);
diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h
index 20a25b1493..ee22760ab1 100644
--- a/be/src/runtime/runtime_state.h
+++ b/be/src/runtime/runtime_state.h
@@ -43,7 +43,6 @@ class DateTimeValue;
 class MemTracker;
 class DataStreamRecvr;
 class ResultBufferMgr;
-class DiskIoMgrs;
 class TmpFileMgr;
 class BufferedBlockMgr;
 class BufferedBlockMgr2;
diff --git a/be/src/util/filesystem_util.h b/be/src/util/filesystem_util.h
index ee5adbcf99..ac29295e3e 100644
--- a/be/src/util/filesystem_util.h
+++ b/be/src/util/filesystem_util.h
@@ -25,8 +25,8 @@
 namespace doris {
 
 // Utility class for common local file system operations such as file creation and
-// deletion. This class should NOT be used to read or write data (DiskIoMgr is used
-// for that). Errors are indicated by the status code RUNTIME_ERROR, and are not
+// deletion. This class should NOT be used to read or write data
+// Errors are indicated by the status code RUNTIME_ERROR, and are not
 // handled via exceptions.
 class FileSystemUtil {
 public:
diff --git a/be/test/CMakeLists.txt b/be/test/CMakeLists.txt
index 6c59e91d46..28f41c081f 100644
--- a/be/test/CMakeLists.txt
+++ b/be/test/CMakeLists.txt
@@ -159,7 +159,6 @@ set(RUNTIME_TEST_FILES
     # runtime/dpp_sink_test.cpp
     # runtime/data_spliter_test.cpp
     # runtime/tmp_file_mgr_test.cpp
-    # runtime/disk_io_mgr_test.cpp
     # runtime/thread_resource_mgr_test.cpp
     # runtime/export_task_mgr_test.cpp
     runtime/mem_pool_test.cpp
diff --git a/be/test/runtime/disk_io_mgr_test.cpp b/be/test/runtime/disk_io_mgr_test.cpp
deleted file mode 100644
index 4b5666a240..0000000000
--- a/be/test/runtime/disk_io_mgr_test.cpp
+++ /dev/null
@@ -1,1069 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "runtime/disk_io_mgr.h"
-
-#include <gtest/gtest.h>
-#include <sched.h>
-#include <sys/stat.h>
-
-#include <functional>
-#include <thread>
-
-#include "util/cpu_info.h"
-#include "util/disk_info.h"
-#include "util/thread_group.h"
-
-using std::string;
-using std::stringstream;
-using std::vector;
-using std::list;
-
-using std::lock_guard;
-using std::unique_lock;
-using std::mutex;
-using std::mem_fn;
-using std::condition_variable;
-using std::unique_ptr;
-using std::thread;
-
-namespace doris {
-
-const int MIN_BUFFER_SIZE = 512;
-const int MAX_BUFFER_SIZE = 1024;
-const int LARGE_MEM_LIMIT = 1024 * 1024 * 1024;
-
-class DiskIoMgrTest : public testing::Test {
-public:
-    void write_validate_callback(int num_writes, DiskIoMgr::WriteRange** written_range,
-                                 DiskIoMgr* io_mgr, DiskIoMgr::RequestContext* reader,
-                                 int32_t* data, Status expected_status, const Status& status) {
-        if (expected_status.code() == E_CANCELLED) {
-            EXPECT_TRUE(status.ok() || status.is<E_CANCELLED>());
-        } else {
-            EXPECT_TRUE(status.code() == expected_status.code());
-        }
-        if (status.ok()) {
-            DiskIoMgr::ScanRange* scan_range = _pool->add(new DiskIoMgr::ScanRange());
-            scan_range->reset(nullptr, (*written_range)->file(), (*written_range)->len(),
-                              (*written_range)->offset(), 0, false, false,
-                              DiskIoMgr::ScanRange::NEVER_CACHE);
-            validate_sync_read(io_mgr, reader, scan_range, reinterpret_cast<const char*>(data),
-                               sizeof(int32_t));
-        }
-
-        {
-            lock_guard<mutex> l(_written_mutex);
-            ++_num_ranges_written;
-            if (_num_ranges_written == num_writes) {
-                _writes_done.notify_one();
-            }
-        }
-    }
-
-    void write_complete_callback(int num_writes, const Status& status) {
-        EXPECT_TRUE(status.ok());
-        {
-            lock_guard<mutex> l(_written_mutex);
-            ++_num_ranges_written;
-            if (_num_ranges_written == num_writes) {
-                _writes_done.notify_all();
-            }
-        }
-    }
-
-protected:
-    void CreateTempFile(const char* filename, const char* data) {
-        FILE* file = fopen(filename, "w");
-        EXPECT_TRUE(file != nullptr);
-        fwrite(data, 1, strlen(data), file);
-        fclose(file);
-    }
-
-    int CreateTempFile(const char* filename, int file_size) {
-        FILE* file = fopen(filename, "w");
-        EXPECT_TRUE(file != nullptr);
-        int success = fclose(file);
-        if (success != 0) {
-            LOG(ERROR) << "Error closing file " << filename;
-            return success;
-        }
-        return truncate(filename, file_size);
-    }
-
-    // Validates that buffer[i] is \0 or expected[i]
-    static void validate_empty_or_correct(const char* expected, const char* buffer, int len) {
-        for (int i = 0; i < len; ++i) {
-            if (buffer[i] != '\0') {
-                EXPECT_EQ(expected[i], buffer[i]) << (int)expected[i] << " != " << (int)buffer[i];
-            }
-        }
-    }
-
-    static void validate_sync_read(DiskIoMgr* io_mgr, DiskIoMgr::RequestContext* reader,
-                                   DiskIoMgr::ScanRange* range, const char* expected,
-                                   int expected_len = -1) {
-        DiskIoMgr::BufferDescriptor* buffer;
-        Status status = io_mgr->read(reader, range, &buffer);
-        EXPECT_TRUE(status.ok());
-        EXPECT_TRUE(buffer != nullptr);
-        EXPECT_EQ(buffer->len(), range->len());
-        if (expected_len < 0) {
-            expected_len = strlen(expected);
-        }
-        int cmp = memcmp(buffer->buffer(), expected, expected_len);
-        EXPECT_TRUE(cmp == 0);
-        buffer->return_buffer();
-    }
-
-    static void validate_scan_range(DiskIoMgr::ScanRange* range, const char* expected,
-                                    int expected_len, const Status& expected_status) {
-        char result[expected_len + 1];
-        memset(result, 0, expected_len + 1);
-
-        while (true) {
-            DiskIoMgr::BufferDescriptor* buffer = nullptr;
-            Status status = range->get_next(&buffer);
-            EXPECT_TRUE(status.ok() || status.code() == expected_status.code());
-            if (buffer == nullptr || !status.ok()) {
-                if (buffer != nullptr) buffer->return_buffer();
-                break;
-            }
-            EXPECT_LE(buffer->len(), expected_len);
-            memcpy(result + range->offset() + buffer->scan_range_offset(), buffer->buffer(),
-                   buffer->len());
-            buffer->return_buffer();
-        }
-        validate_empty_or_correct(expected, result, expected_len);
-    }
-
-    // Continues pulling scan ranges from the io mgr until they are all done.
-    // Updates num_ranges_processed with the number of ranges seen by this thread.
-    static void scan_range_thread(DiskIoMgr* io_mgr, DiskIoMgr::RequestContext* reader,
-                                  const char* expected_result, int expected_len,
-                                  const Status& expected_status, int max_ranges,
-                                  std::atomic<int>* num_ranges_processed) {
-        int num_ranges = 0;
-        while (max_ranges == 0 || num_ranges < max_ranges) {
-            DiskIoMgr::ScanRange* range;
-            Status status = io_mgr->get_next_range(reader, &range);
-            EXPECT_TRUE(status.ok() || status.code() == expected_status.code());
-            if (range == nullptr) break;
-            validate_scan_range(range, expected_result, expected_len, expected_status);
-            ++(*num_ranges_processed);
-            ++num_ranges;
-        }
-    }
-
-    DiskIoMgr::ScanRange* init_range(int num_buffers, const char* file_path, int offset, int len,
-                                     int disk_id, int64_t mtime, void* meta_data = nullptr,
-                                     bool is_cached = false) {
-        DiskIoMgr::ScanRange* range = _pool->add(new DiskIoMgr::ScanRange(num_buffers));
-        range->reset(nullptr, file_path, len, offset, disk_id, is_cached, true, mtime, meta_data);
-        EXPECT_EQ(mtime, range->mtime());
-        return range;
-    }
-
-    std::unique_ptr<ObjectPool> _pool;
-
-    mutex _written_mutex;
-    condition_variable _writes_done;
-    int _num_ranges_written;
-};
-
-// Test a single writer with multiple disks and threads per disk. Each WriteRange
-// writes random 4-byte integers, and upon completion, the written data is validated
-// by reading the data back via a separate IoMgr instance. All writes are expected to
-// complete successfully.
-TEST_F(DiskIoMgrTest, SingleWriter) {
-    _num_ranges_written = 0;
-    string tmp_file = "/tmp/disk_io_mgr_test.txt";
-    int num_ranges = 100;
-    int64_t file_size = 1024 * 1024;
-    int64_t cur_offset = 0;
-    int success = CreateTempFile(tmp_file.c_str(), file_size);
-    if (success != 0) {
-        LOG(ERROR) << "Error creating temp file " << tmp_file.c_str() << " of size " << file_size;
-        EXPECT_TRUE(false);
-    }
-
-    std::unique_ptr<DiskIoMgr> read_io_mgr(new DiskIoMgr(1, 1, 1, 10));
-    Status status = read_io_mgr->init(LARGE_MEM_LIMIT);
-    EXPECT_TRUE(status.ok());
-    DiskIoMgr::RequestContext* reader;
-    status = read_io_mgr->register_context(&reader);
-    EXPECT_TRUE(status.ok());
-    for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
-        for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
-            _pool.reset(new ObjectPool);
-            DiskIoMgr io_mgr(num_disks, num_threads_per_disk, 1, 10);
-            status = io_mgr.init(LARGE_MEM_LIMIT);
-            EXPECT_TRUE(status.ok());
-            DiskIoMgr::RequestContext* writer;
-            io_mgr.register_context(&writer);
-            for (int i = 0; i < num_ranges; ++i) {
-                int32_t* data = _pool->add(new int32_t);
-                *data = rand();
-                DiskIoMgr::WriteRange** new_range = _pool->add(new DiskIoMgr::WriteRange*);
-                DiskIoMgr::WriteRange::WriteDoneCallback callback =
-                        bind(mem_fn(&DiskIoMgrTest::write_validate_callback), this, num_ranges,
-                             new_range, read_io_mgr.get(), reader, data, Status::OK(), _1);
-                *new_range = _pool->add(new DiskIoMgr::WriteRange(
-                        tmp_file, cur_offset, num_ranges % num_disks, callback));
-                (*new_range)->set_data(reinterpret_cast<uint8_t*>(data), sizeof(int32_t));
-                Status add_status = io_mgr.add_write_range(writer, *new_range);
-                EXPECT_TRUE(add_status.ok());
-                cur_offset += sizeof(int32_t);
-            }
-
-            {
-                unique_lock<mutex> lock(_written_mutex);
-                while (_num_ranges_written < num_ranges) {
-                    _writes_done.wait(lock);
-                }
-            }
-            _num_ranges_written = 0;
-            io_mgr.unregister_context(writer);
-        }
-    }
-
-    read_io_mgr->unregister_context(reader);
-    read_io_mgr.reset();
-}
-
-// Perform invalid writes (e.g. non-existent file, negative offset) and validate
-// that an error status is returned via the write callback.
-TEST_F(DiskIoMgrTest, InvalidWrite) {
-    _num_ranges_written = 0;
-    string tmp_file = "/tmp/non-existent.txt";
-    DiskIoMgr io_mgr(1, 1, 1, 10);
-    Status status = io_mgr.init(LARGE_MEM_LIMIT);
-    EXPECT_TRUE(status.ok());
-    DiskIoMgr::RequestContext* writer;
-    status = io_mgr.register_context(&writer);
-    _pool.reset(new ObjectPool);
-    int32_t* data = _pool->add(new int32_t);
-    *data = rand();
-
-    // Write to a non-existent file.
-    DiskIoMgr::WriteRange** new_range = _pool->add(new DiskIoMgr::WriteRange*);
-    DiskIoMgr::WriteRange::WriteDoneCallback callback =
-            bind(mem_fn(&DiskIoMgrTest::write_validate_callback), this, 2, new_range,
-                 (DiskIoMgr*)nullptr, (DiskIoMgr::RequestContext*)nullptr, data,
-                 Status::InternalError("Test Failure"), _1);
-    *new_range = _pool->add(new DiskIoMgr::WriteRange(tmp_file, rand(), 0, callback));
-
-    (*new_range)->set_data(reinterpret_cast<uint8_t*>(data), sizeof(int32_t));
-    status = io_mgr.add_write_range(writer, *new_range);
-    EXPECT_TRUE(status.ok());
-
-    // Write to a bad location in a file that exists.
-    tmp_file = "/tmp/disk_io_mgr_test.txt";
-    int success = CreateTempFile(tmp_file.c_str(), 100);
-    if (success != 0) {
-        LOG(ERROR) << "Error creating temp file " << tmp_file.c_str() << " of size 100";
-        EXPECT_TRUE(false);
-    }
-
-    new_range = _pool->add(new DiskIoMgr::WriteRange*);
-    callback = bind(mem_fn(&DiskIoMgrTest::write_validate_callback), this, 2, new_range,
-                    (DiskIoMgr*)nullptr, (DiskIoMgr::RequestContext*)nullptr, data,
-                    Status::InternalError("Test Failure"), _1);
-
-    *new_range = _pool->add(new DiskIoMgr::WriteRange(tmp_file, -1, 0, callback));
-    (*new_range)->set_data(reinterpret_cast<uint8_t*>(data), sizeof(int32_t));
-    status = io_mgr.add_write_range(writer, *new_range);
-    EXPECT_TRUE(status.ok());
-
-    {
-        unique_lock<mutex> lock(_written_mutex);
-        while (_num_ranges_written < 2) {
-            _writes_done.wait(lock);
-        }
-    }
-    _num_ranges_written = 0;
-    io_mgr.unregister_context(writer);
-}
-
-// Issue a number of writes, cancel the writer context and issue more writes.
-// add_write_range() is expected to succeed before the cancel and fail after it.
-// The writes themselves may finish with status cancelled or ok.
-TEST_F(DiskIoMgrTest, SingleWriterCancel) {
-    _num_ranges_written = 0;
-    string tmp_file = "/tmp/disk_io_mgr_test.txt";
-    int num_ranges = 100;
-    int num_ranges_before_cancel = 25;
-    int64_t file_size = 1024 * 1024;
-    int64_t cur_offset = 0;
-    int success = CreateTempFile(tmp_file.c_str(), file_size);
-    if (success != 0) {
-        LOG(ERROR) << "Error creating temp file " << tmp_file.c_str() << " of size " << file_size;
-        EXPECT_TRUE(false);
-    }
-
-    std::unique_ptr<DiskIoMgr> read_io_mgr(new DiskIoMgr(1, 1, 1, 10));
-    Status status = read_io_mgr->init(LARGE_MEM_LIMIT);
-    EXPECT_TRUE(status.ok());
-    DiskIoMgr::RequestContext* reader;
-    status = read_io_mgr->register_context(&reader);
-    EXPECT_TRUE(status.ok());
-    for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
-        for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
-            _pool.reset(new ObjectPool);
-            DiskIoMgr io_mgr(num_disks, num_threads_per_disk, 1, 10);
-            status = io_mgr.init(LARGE_MEM_LIMIT);
-            DiskIoMgr::RequestContext* writer;
-            io_mgr.register_context(&writer);
-            Status validate_status = Status::OK();
-            for (int i = 0; i < num_ranges; ++i) {
-                if (i == num_ranges_before_cancel) {
-                    io_mgr.cancel_context(writer);
-                    validate_status = Status::Cancelled("");
-                }
-                int32_t* data = _pool->add(new int32_t);
-                *data = rand();
-                DiskIoMgr::WriteRange** new_range = _pool->add(new DiskIoMgr::WriteRange*);
-                DiskIoMgr::WriteRange::WriteDoneCallback callback =
-                        bind(mem_fn(&DiskIoMgrTest::write_validate_callback), this,
-                             num_ranges_before_cancel, new_range, read_io_mgr.get(), reader, data,
-                             Status::Cancelled(""), _1);
-                *new_range = _pool->add(new DiskIoMgr::WriteRange(
-                        tmp_file, cur_offset, num_ranges % num_disks, callback));
-                (*new_range)->set_data(reinterpret_cast<uint8_t*>(data), sizeof(int32_t));
-                cur_offset += sizeof(int32_t);
-                Status add_status = io_mgr.add_write_range(writer, *new_range);
-                EXPECT_TRUE(add_status.code() == validate_status.code());
-            }
-
-            {
-                unique_lock<mutex> lock(_written_mutex);
-                while (_num_ranges_written < num_ranges_before_cancel) {
-                    _writes_done.wait(lock);
-                }
-            }
-            _num_ranges_written = 0;
-            io_mgr.unregister_context(writer);
-        }
-    }
-
-    read_io_mgr->unregister_context(reader);
-    read_io_mgr.reset();
-}
-
-// Basic test with a single reader, testing multiple threads, disks and a different
-// number of buffers.
-TEST_F(DiskIoMgrTest, SingleReader) {
-    const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
-    const char* data = "abcdefghijklm";
-    int len = strlen(data);
-    CreateTempFile(tmp_file, data);
-
-    // Get mtime for file
-    struct stat stat_val;
-    stat(tmp_file, &stat_val);
-
-    int64_t iters = 0;
-    for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
-        for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
-            for (int num_buffers = 1; num_buffers <= 5; ++num_buffers) {
-                for (int num_read_threads = 1; num_read_threads <= 5; ++num_read_threads) {
-                    _pool.reset(new ObjectPool);
-                    LOG(INFO) << "Starting test with num_threads_per_disk=" << num_threads_per_disk
-                              << " num_disk=" << num_disks << " num_buffers=" << num_buffers
-                              << " num_read_threads=" << num_read_threads;
-
-                    if (++iters % 5000 == 0) {
-                        LOG(ERROR) << "Starting iteration " << iters;
-                    }
-                    DiskIoMgr io_mgr(num_disks, num_threads_per_disk, 1, 1);
-
-                    Status status = io_mgr.init(LARGE_MEM_LIMIT);
-                    EXPECT_TRUE(status.ok());
-                    DiskIoMgr::RequestContext* reader;
-                    status = io_mgr.register_context(&reader);
-                    EXPECT_TRUE(status.ok());
-
-                    std::vector<DiskIoMgr::ScanRange*> ranges;
-                    for (int i = 0; i < len; ++i) {
-                        int disk_id = i % num_disks;
-                        ranges.push_back(init_range(num_buffers, tmp_file, 0, len, disk_id,
-                                                    stat_val.st_mtime));
-                    }
-                    status = io_mgr.add_scan_ranges(reader, ranges);
-                    EXPECT_TRUE(status.ok());
-
-                    std::atomic<int> num_ranges_processed;
-                    ThreadGroup threads;
-                    for (int i = 0; i < num_read_threads; ++i) {
-                        threads.add_thread(new thread(scan_range_thread, &io_mgr, reader, data, len,
-                                                      Status::OK(), 0, &num_ranges_processed));
-                    }
-                    threads.join_all();
-
-                    EXPECT_EQ(num_ranges_processed, ranges.size());
-                    io_mgr.unregister_context(reader);
-                }
-            }
-        }
-    }
-    EXPECT_EQ(io_mgr->mem_tracker()->consumption(), 0);
-}
-
-// This test issues adding additional scan ranges while there are some still in flight.
-TEST_F(DiskIoMgrTest, AddScanRangeTest) {
-    const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
-    const char* data = "abcdefghijklm";
-    int len = strlen(data);
-    CreateTempFile(tmp_file, data);
-
-    // Get mtime for file
-    struct stat stat_val;
-    stat(tmp_file, &stat_val);
-
-    int64_t iters = 0;
-    for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
-        for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
-            for (int num_buffers = 1; num_buffers <= 5; ++num_buffers) {
-                _pool.reset(new ObjectPool);
-                LOG(INFO) << "Starting test with num_threads_per_disk=" << num_threads_per_disk
-                          << " num_disk=" << num_disks << " num_buffers=" << num_buffers;
-
-                if (++iters % 5000 == 0) LOG(ERROR) << "Starting iteration " << iters;
-                DiskIoMgr io_mgr(num_disks, num_threads_per_disk, 1, 1);
-
-                Status status = io_mgr.init(LARGE_MEM_LIMIT);
-                EXPECT_TRUE(status.ok());
-                DiskIoMgr::RequestContext* reader;
-                status = io_mgr.register_context(&reader);
-                EXPECT_TRUE(status.ok());
-
-                std::vector<DiskIoMgr::ScanRange*> ranges_first_half;
-                std::vector<DiskIoMgr::ScanRange*> ranges_second_half;
-                for (int i = 0; i < len; ++i) {
-                    int disk_id = i % num_disks;
-                    if (i > len / 2) {
-                        ranges_second_half.push_back(init_range(num_buffers, tmp_file, i, 1,
-                                                                disk_id, stat_val.st_mtime));
-                    } else {
-                        ranges_first_half.push_back(init_range(num_buffers, tmp_file, i, 1, disk_id,
-                                                               stat_val.st_mtime));
-                    }
-                }
-                std::atomic<int> num_ranges_processed;
-
-                // Issue first half the scan ranges.
-                status = io_mgr.add_scan_ranges(reader, ranges_first_half);
-                EXPECT_TRUE(status.ok());
-
-                // Read a couple of them
-                scan_range_thread(&io_mgr, reader, data, strlen(data), Status::OK(), 2,
-                                  &num_ranges_processed);
-
-                // Issue second half
-                status = io_mgr.add_scan_ranges(reader, ranges_second_half);
-                EXPECT_TRUE(status.ok());
-
-                // Start up some threads and then cancel
-                ThreadGroup threads;
-                for (int i = 0; i < 3; ++i) {
-                    threads.add_thread(new thread(scan_range_thread, &io_mgr, reader, data,
-                                                  strlen(data), Status::Cancelled(""), 0,
-                                                  &num_ranges_processed));
-                }
-
-                threads.join_all();
-                EXPECT_EQ(num_ranges_processed, len);
-                io_mgr.unregister_context(reader);
-            }
-        }
-    }
-    EXPECT_EQ(io_mgr->mem_tracker()->consumption(), 0);
-}
-
-// Test to make sure that sync reads and async reads work together
-// Note: this test is constructed so the number of buffers is greater than the
-// number of scan ranges.
-TEST_F(DiskIoMgrTest, SyncReadTest) {
-    const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
-    const char* data = "abcdefghijklm";
-    int len = strlen(data);
-    CreateTempFile(tmp_file, data);
-
-    // Get mtime for file
-    struct stat stat_val;
-    stat(tmp_file, &stat_val);
-
-    int64_t iters = 0;
-    for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
-        for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
-            for (int num_buffers = 1; num_buffers <= 5; ++num_buffers) {
-                _pool.reset(new ObjectPool);
-                LOG(INFO) << "Starting SyncReadTest test with num_threads_per_disk="
-                          << num_threads_per_disk << " num_disk=" << num_disks
-                          << " num_buffers=" << num_buffers;
-
-                if (++iters % 5000 == 0) {
-                    LOG(ERROR) << "Starting iteration " << iters;
-                }
-                DiskIoMgr io_mgr(num_disks, num_threads_per_disk, MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
-
-                Status status = io_mgr.init(LARGE_MEM_LIMIT);
-                EXPECT_TRUE(status.ok());
-                DiskIoMgr::RequestContext* reader;
-                status = io_mgr.register_context(&reader);
-                EXPECT_TRUE(status.ok());
-
-                DiskIoMgr::ScanRange* complete_range =
-                        init_range(1, tmp_file, 0, strlen(data), 0, stat_val.st_mtime);
-
-                // Issue some reads before the async ones are issued
-                validate_sync_read(&io_mgr, reader, complete_range, data);
-                validate_sync_read(&io_mgr, reader, complete_range, data);
-
-                std::vector<DiskIoMgr::ScanRange*> ranges;
-                for (int i = 0; i < len; ++i) {
-                    int disk_id = i % num_disks;
-                    ranges.push_back(
-                            init_range(num_buffers, tmp_file, 0, len, disk_id, stat_val.st_mtime));
-                }
-                status = io_mgr.add_scan_ranges(reader, ranges);
-                EXPECT_TRUE(status.ok());
-
-                std::atomic<int> num_ranges_processed;
-                ThreadGroup threads;
-                for (int i = 0; i < 5; ++i) {
-                    threads.add_thread(new thread(scan_range_thread, &io_mgr, reader, data,
-                                                  strlen(data), Status::OK(), 0,
-                                                  &num_ranges_processed));
-                }
-
-                // Issue some more sync ranges
-                for (int i = 0; i < 5; ++i) {
-                    sched_yield();
-                    validate_sync_read(&io_mgr, reader, complete_range, data);
-                }
-
-                threads.join_all();
-
-                validate_sync_read(&io_mgr, reader, complete_range, data);
-                validate_sync_read(&io_mgr, reader, complete_range, data);
-
-                EXPECT_EQ(num_ranges_processed, ranges.size());
-                io_mgr.unregister_context(reader);
-            }
-        }
-    }
-    EXPECT_EQ(io_mgr->mem_tracker()->consumption(), 0);
-}
-
-// Tests a single reader cancelling half way through scan ranges.
-TEST_F(DiskIoMgrTest, SingleReaderCancel) {
-    const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
-    const char* data = "abcdefghijklm";
-    int len = strlen(data);
-    CreateTempFile(tmp_file, data);
-
-    // Get mtime for file
-    struct stat stat_val;
-    stat(tmp_file, &stat_val);
-
-    int64_t iters = 0;
-    for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
-        for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
-            for (int num_buffers = 1; num_buffers <= 5; ++num_buffers) {
-                _pool.reset(new ObjectPool);
-                LOG(INFO) << "Starting test with num_threads_per_disk=" << num_threads_per_disk
-                          << " num_disk=" << num_disks << " num_buffers=" << num_buffers;
-
-                if (++iters % 5000 == 0) LOG(ERROR) << "Starting iteration " << iters;
-                DiskIoMgr io_mgr(num_disks, num_threads_per_disk, 1, 1);
-
-                Status status = io_mgr.init(LARGE_MEM_LIMIT);
-                EXPECT_TRUE(status.ok());
-                DiskIoMgr::RequestContext* reader;
-                status = io_mgr.register_context(&reader);
-                EXPECT_TRUE(status.ok());
-
-                std::vector<DiskIoMgr::ScanRange*> ranges;
-                for (int i = 0; i < len; ++i) {
-                    int disk_id = i % num_disks;
-                    ranges.push_back(
-                            init_range(num_buffers, tmp_file, 0, len, disk_id, stat_val.st_mtime));
-                }
-                status = io_mgr.add_scan_ranges(reader, ranges);
-                EXPECT_TRUE(status.ok());
-
-                std::atomic<int> num_ranges_processed;
-                int num_successful_ranges = ranges.size() / 2;
-                // Read half the ranges
-                for (int i = 0; i < num_successful_ranges; ++i) {
-                    scan_range_thread(&io_mgr, reader, data, strlen(data), Status::OK(), 1,
-                                      &num_ranges_processed);
-                }
-                EXPECT_EQ(num_ranges_processed, num_successful_ranges);
-
-                // Start up some threads and then cancel
-                ThreadGroup threads;
-                for (int i = 0; i < 3; ++i) {
-                    threads.add_thread(new thread(scan_range_thread, &io_mgr, reader, data,
-                                                  strlen(data), Status::Cancelled(""), 0,
-                                                  &num_ranges_processed));
-                }
-
-                io_mgr.cancel_context(reader);
-                sched_yield();
-
-                threads.join_all();
-                EXPECT_TRUE(io_mgr.context_status(reader).is_cancelled());
-                io_mgr.unregister_context(reader);
-            }
-        }
-    }
-    EXPECT_EQ(io_mgr->mem_tracker()->consumption(), 0);
-}
-
-// Test when the reader goes over the mem limit
-TEST_F(DiskIoMgrTest, MemTrackers) {
-    const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
-    const char* data = "abcdefghijklm";
-    int len = strlen(data);
-    CreateTempFile(tmp_file, data);
-
-    // Get mtime for file
-    struct stat stat_val;
-    stat(tmp_file, &stat_val);
-
-    const int num_buffers = 25;
-    // Give the reader more buffers than the limit
-    const int mem_limit_num_buffers = 2;
-
-    int64_t iters = 0;
-    {
-        _pool.reset(new ObjectPool);
-        if (++iters % 1000 == 0) {
-            LOG(ERROR) << "Starting iteration " << iters;
-        }
-
-        DiskIoMgr io_mgr(1, 1, MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
-
-        Status status = io_mgr.init(LARGE_MEM_LIMIT);
-        EXPECT_TRUE(status.ok());
-        DiskIoMgr::RequestContext* reader;
-        status = io_mgr.register_context(&reader);
-        EXPECT_TRUE(status.ok());
-
-        std::vector<DiskIoMgr::ScanRange*> ranges;
-        for (int i = 0; i < num_buffers; ++i) {
-            ranges.push_back(init_range(num_buffers, tmp_file, 0, len, 0, stat_val.st_mtime));
-        }
-        status = io_mgr.add_scan_ranges(reader, ranges);
-        EXPECT_TRUE(status.ok());
-
-        // Don't return buffers to force memory pressure
-        std::vector<DiskIoMgr::BufferDescriptor*> buffers;
-
-        std::atomic<int> num_ranges_processed;
-        scan_range_thread(&io_mgr, reader, data, strlen(data), Status::MemoryLimitExceeded("Mem"),
-                          1, &num_ranges_processed);
-
-        char result[strlen(data) + 1];
-        // Keep reading new ranges without returning buffers. This forces us
-        // to go over the limit eventually.
-        while (true) {
-            memset(result, 0, strlen(data) + 1);
-            DiskIoMgr::ScanRange* range = nullptr;
-            status = io_mgr.get_next_range(reader, &range);
-            EXPECT_TRUE(status.ok() || status.is_mem_limit_exceeded());
-            if (range == nullptr) break;
-
-            while (true) {
-                DiskIoMgr::BufferDescriptor* buffer = nullptr;
-                Status status = range->get_next(&buffer);
-                EXPECT_TRUE(status.ok() || status.is_mem_limit_exceeded());
-                if (buffer == nullptr) break;
-                memcpy(result + range->offset() + buffer->scan_range_offset(), buffer->buffer(),
-                       buffer->len());
-                buffers.push_back(buffer);
-            }
-            validate_empty_or_correct(data, result, strlen(data));
-        }
-
-        for (int i = 0; i < buffers.size(); ++i) {
-            buffers[i]->return_buffer();
-        }
-
-        EXPECT_TRUE(io_mgr.context_status(reader).is_mem_limit_exceeded());
-        io_mgr.unregister_context(reader);
-    }
-}
-#if 0
-// Test when some scan ranges are marked as being cached.
-// Since these files are not in HDFS, the cached path always fails so this
-// only tests the fallback mechanism.
-// TODO: we can fake the cached read path without HDFS
-TEST_F(DiskIoMgrTest, CachedReads) {
-    const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
-    const char* data = "abcdefghijklm";
-    int len = strlen(data);
-    CreateTempFile(tmp_file, data);
-
-    // Get mtime for file
-    struct stat stat_val;
-    stat(tmp_file, &stat_val);
-
-    const int num_disks = 2;
-    const int num_buffers = 3;
-
-    int64_t iters = 0;
-    {
-        _pool.reset(new ObjectPool);
-        if (++iters % 5000 == 0) LOG(ERROR) << "Starting iteration " << iters;
-        DiskIoMgr io_mgr(num_disks, 1, MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
-
-        Status status = io_mgr.init(LARGE_MEM_LIMIT);
-        EXPECT_TRUE(status.ok());
-        DiskIoMgr::RequestContext* reader;
-        status = io_mgr.register_context(&reader);
-        EXPECT_TRUE(status.ok());
-
-        DiskIoMgr::ScanRange* complete_range =
-            init_range(1, tmp_file, 0, strlen(data), 0, stat_val.st_mtime, nullptr, true);
-
-        // Issue some reads before the async ones are issued
-        validate_sync_read(&io_mgr, reader, complete_range, data);
-        validate_sync_read(&io_mgr, reader, complete_range, data);
-
-        std::vector<DiskIoMgr::ScanRange*> ranges;
-        for (int i = 0; i < len; ++i) {
-            int disk_id = i % num_disks;
-            ranges.push_back(init_range(num_buffers, tmp_file, 0, len, disk_id,
-                        stat_val.st_mtime, nullptr, true));
-        }
-        status = io_mgr.add_scan_ranges(reader, ranges);
-        EXPECT_TRUE(status.ok());
-
-        std::atomic<int> num_ranges_processed;
-        ThreadGroup threads;
-        for (int i = 0; i < 5; ++i) {
-            threads.add_thread(new thread(scan_range_thread, &io_mgr, reader, data,
-                        strlen(data), Status::OK(), 0, &num_ranges_processed));
-        }
-
-        // Issue some more sync ranges
-        for (int i = 0; i < 5; ++i) {
-            sched_yield();
-            validate_sync_read(&io_mgr, reader, complete_range, data);
-        }
-
-        threads.join_all();
-
-        validate_sync_read(&io_mgr, reader, complete_range, data);
-        validate_sync_read(&io_mgr, reader, complete_range, data);
-
-        EXPECT_EQ(num_ranges_processed, ranges.size());
-        io_mgr.unregister_context(reader);
-    }
-    EXPECT_EQ(io_mgr->mem_tracker()->consumption(), 0);
-}
-#endif // end #if 0
-
-TEST_F(DiskIoMgrTest, MultipleReaderWriter) {
-    const int ITERATIONS = 1;
-    const char* data = "abcdefghijklmnopqrstuvwxyz";
-    const int num_contexts = 5;
-    const int file_size = 4 * 1024;
-    const int num_writes_queued = 5;
-    const int num_reads_queued = 5;
-
-    string file_name = "/tmp/disk_io_mgr_test.txt";
-    int success = CreateTempFile(file_name.c_str(), file_size);
-    if (success != 0) {
-        LOG(ERROR) << "Error creating temp file " << file_name.c_str() << " of size " << file_size;
-        EXPECT_TRUE(false);
-    }
-
-    // Get mtime for file
-    struct stat stat_val;
-    stat(file_name.c_str(), &stat_val);
-
-    int64_t iters = 0;
-    std::vector<DiskIoMgr::RequestContext*> contexts(num_contexts);
-    Status status;
-    for (int iteration = 0; iteration < ITERATIONS; ++iteration) {
-        for (int threads_per_disk = 1; threads_per_disk <= 5; ++threads_per_disk) {
-            for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
-                DiskIoMgr io_mgr(num_disks, threads_per_disk, MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
-                io_mgr.init(LARGE_MEM_LIMIT);
-                for (int file_index = 0; file_index < num_contexts; ++file_index) {
-                    status = io_mgr.register_context(&contexts[file_index]);
-                    EXPECT_TRUE(status.ok());
-                }
-                _pool.reset(new ObjectPool);
-                int read_offset = 0;
-                int write_offset = 0;
-                while (read_offset < file_size) {
-                    for (int context_index = 0; context_index < num_contexts; ++context_index) {
-                        if (++iters % 5000 == 0) {
-                            LOG(ERROR) << "Starting iteration " << iters;
-                        }
-                        std::atomic<int> num_ranges_processed;
-                        ThreadGroup threads;
-                        std::vector<DiskIoMgr::ScanRange*> ranges;
-                        int num_scan_ranges =
-                                std::min<int>(num_reads_queued, write_offset - read_offset);
-                        for (int i = 0; i < num_scan_ranges; ++i) {
-                            ranges.push_back(init_range(1, file_name.c_str(), read_offset, 1,
-                                                        i % num_disks, stat_val.st_mtime));
-                            threads.add_thread(new thread(
-                                    scan_range_thread, &io_mgr, contexts[context_index],
-                                    reinterpret_cast<const char*>(data +
-                                                                  (read_offset % strlen(data))),
-                                    1, Status::OK(), num_scan_ranges, &num_ranges_processed));
-                            ++read_offset;
-                        }
-
-                        _num_ranges_written = 0;
-                        int num_write_ranges =
-                                std::min<int>(num_writes_queued, file_size - write_offset);
-                        for (int i = 0; i < num_write_ranges; ++i) {
-                            DiskIoMgr::WriteRange::WriteDoneCallback callback =
-                                    bind(mem_fn(&DiskIoMgrTest::write_complete_callback), this,
-                                         num_write_ranges, _1);
-                            DiskIoMgr::WriteRange* new_range = _pool->add(new DiskIoMgr::WriteRange(
-                                    file_name, write_offset, i % num_disks, callback));
-                            new_range->set_data(reinterpret_cast<const uint8_t*>(
-                                                        data + (write_offset % strlen(data))),
-                                                1);
-                            status = io_mgr.add_write_range(contexts[context_index], new_range);
-                            ++write_offset;
-                        }
-
-                        {
-                            unique_lock<mutex> lock(_written_mutex);
-                            while (_num_ranges_written < num_write_ranges) {
-                                _writes_done.wait(lock);
-                            }
-                        }
-
-                        threads.join_all();
-                    } // for (int context_index
-                }     // while (read_offset < file_size)
-
-                for (int file_index = 0; file_index < num_contexts; ++file_index) {
-                    io_mgr.unregister_context(contexts[file_index]);
-                }
-            } // for (int num_disks
-        }     // for (int threads_per_disk
-    }         // for (int iteration
-}
-
-// This test will test multiple concurrent reads each reading a different file.
-TEST_F(DiskIoMgrTest, MultipleReader) {
-    const int NUM_READERS = 5;
-    const int DATA_LEN = 50;
-    const int ITERATIONS = 25;
-    const int NUM_THREADS_PER_READER = 3;
-
-    std::vector<string> file_names;
-    std::vector<int64_t> mtimes;
-    std::vector<string> data;
-    std::vector<DiskIoMgr::RequestContext*> readers;
-    std::vector<char*> results;
-
-    file_names.resize(NUM_READERS);
-    readers.resize(NUM_READERS);
-    mtimes.resize(NUM_READERS);
-    data.resize(NUM_READERS);
-    results.resize(NUM_READERS);
-
-    // Initialize data for each reader.  The data will be
-    // 'abcd...' for reader one, 'bcde...' for reader two (wrapping around at 'z')
-    for (int i = 0; i < NUM_READERS; ++i) {
-        char buf[DATA_LEN];
-        for (int j = 0; j < DATA_LEN; ++j) {
-            int c = (j + i) % 26;
-            buf[j] = 'a' + c;
-        }
-        data[i] = string(buf, DATA_LEN);
-
-        std::stringstream ss;
-        ss << "/tmp/disk_io_mgr_test" << i << ".txt";
-        file_names[i] = ss.str();
-        CreateTempFile(ss.str().c_str(), data[i].c_str());
-
-        // Get mtime for file
-        struct stat stat_val;
-        stat(file_names[i].c_str(), &stat_val);
-        mtimes[i] = stat_val.st_mtime;
-
-        results[i] = new char[DATA_LEN + 1];
-        memset(results[i], 0, DATA_LEN + 1);
-    }
-
-    // This exercises concurrency, run the test multiple times
-    int64_t iters = 0;
-    for (int iteration = 0; iteration < ITERATIONS; ++iteration) {
-        for (int threads_per_disk = 1; threads_per_disk <= 5; ++threads_per_disk) {
-            for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
-                for (int num_buffers = 1; num_buffers <= 5; ++num_buffers) {
-                    _pool.reset(new ObjectPool);
-                    LOG(INFO) << "Starting test with num_threads_per_disk=" << threads_per_disk
-                              << " num_disk=" << num_disks << " num_buffers=" << num_buffers;
-                    if (++iters % 2500 == 0) LOG(ERROR) << "Starting iteration " << iters;
-
-                    DiskIoMgr io_mgr(num_disks, threads_per_disk, MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
-                    Status status = io_mgr.init(LARGE_MEM_LIMIT);
-                    EXPECT_TRUE(status.ok());
-
-                    for (int i = 0; i < NUM_READERS; ++i) {
-                        status = io_mgr.register_context(&readers[i], nullptr);
-                        EXPECT_TRUE(status.ok());
-
-                        std::vector<DiskIoMgr::ScanRange*> ranges;
-                        for (int j = 0; j < DATA_LEN; ++j) {
-                            int disk_id = j % num_disks;
-                            ranges.push_back(init_range(num_buffers, file_names[i].c_str(), j, 1,
-                                                        disk_id, mtimes[i]));
-                        }
-                        status = io_mgr.add_scan_ranges(readers[i], ranges);
-                        EXPECT_TRUE(status.ok());
-                    }
-
-                    std::atomic<int> num_ranges_processed;
-                    ThreadGroup threads;
-                    for (int i = 0; i < NUM_READERS; ++i) {
-                        for (int j = 0; j < NUM_THREADS_PER_READER; ++j) {
-                            threads.add_thread(new thread(scan_range_thread, &io_mgr, readers[i],
-                                                          data[i].c_str(), data[i].size(),
-                                                          Status::OK(), 0, &num_ranges_processed));
-                        }
-                    }
-                    threads.join_all();
-                    EXPECT_EQ(num_ranges_processed, DATA_LEN * NUM_READERS);
-                    for (int i = 0; i < NUM_READERS; ++i) {
-                        io_mgr.unregister_context(readers[i]);
-                    }
-                }
-            }
-        }
-    }
-    EXPECT_EQ(io_mgr->mem_tracker()->consumption(), 0);
-}
-
-#if 0
-// Stress test for multiple clients with cancellation
-// TODO: the stress app should be expanded to include sync reads and adding scan
-// ranges in the middle.
-TEST_F(DiskIoMgrTest, StressTest) {
-  // Run the test with 5 disks, 5 threads per disk, 10 clients and with cancellation
-  DiskIoMgrStress test(5, 5, 10, true);
-  test.Run(2); // In seconds
-}
-#endif
-
-TEST_F(DiskIoMgrTest, Buffers) {
-    // Test default min/max buffer size
-    int min_buffer_size = 1024;
-    int max_buffer_size = 8 * 1024 * 1024; // 8 MB
-
-    DiskIoMgr io_mgr(1, 1, min_buffer_size, max_buffer_size);
-    Status status = io_mgr.init(max_buffer_size * 2);
-    EXPECT_TRUE(status.ok());
-    EXPECT_EQ(io_mgr->mem_tracker()->consumption(), 0);
-
-    // buffer length should be rounded up to min buffer size
-    int64_t buffer_len = 1;
-    char* buf = io_mgr.get_free_buffer(&buffer_len);
-    EXPECT_EQ(buffer_len, min_buffer_size);
-    EXPECT_EQ(io_mgr._num_allocated_buffers, 1);
-    io_mgr.return_free_buffer(buf, buffer_len);
-    EXPECT_EQ(io_mgr->mem_tracker()->consumption(), min_buffer_size);
-
-    // reuse buffer
-    buffer_len = min_buffer_size;
-    buf = io_mgr.get_free_buffer(&buffer_len);
-    EXPECT_EQ(buffer_len, min_buffer_size);
-    EXPECT_EQ(io_mgr._num_allocated_buffers, 1);
-    io_mgr.return_free_buffer(buf, buffer_len);
-    EXPECT_EQ(io_mgr->mem_tracker()->consumption(), min_buffer_size);
-
-    // bump up to next buffer size
-    buffer_len = min_buffer_size + 1;
-    buf = io_mgr.get_free_buffer(&buffer_len);
-    EXPECT_EQ(buffer_len, min_buffer_size * 2);
-    EXPECT_EQ(io_mgr._num_allocated_buffers, 2);
-    EXPECT_EQ(io_mgr->mem_tracker()->consumption(), min_buffer_size * 3);
-
-    // gc unused buffer
-    io_mgr.gc_io_buffers();
-    EXPECT_EQ(io_mgr._num_allocated_buffers, 1);
-    EXPECT_EQ(io_mgr->mem_tracker()->consumption(), min_buffer_size * 2);
-
-    io_mgr.return_free_buffer(buf, buffer_len);
-
-    // max buffer size
-    buffer_len = max_buffer_size;
-    buf = io_mgr.get_free_buffer(&buffer_len);
-    EXPECT_EQ(buffer_len, max_buffer_size);
-    EXPECT_EQ(io_mgr._num_allocated_buffers, 2);
-    io_mgr.return_free_buffer(buf, buffer_len);
-    EXPECT_EQ(io_mgr->mem_tracker()->consumption(), min_buffer_size * 2 + max_buffer_size);
-
-    // gc buffers
-    io_mgr.gc_io_buffers();
-    EXPECT_EQ(io_mgr._num_allocated_buffers, 0);
-    EXPECT_EQ(io_mgr->mem_tracker()->consumption(), 0);
-}
-
-// IMPALA-2366: handle partial read where range goes past end of file.
-TEST_F(DiskIoMgrTest, PartialRead) {
-    const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
-    const char* data = "the quick brown fox jumped over the lazy dog";
-    int len = strlen(data);
-    int read_len = len + 1000; // Read past end of file.
-    CreateTempFile(tmp_file, data);
-
-    // Get mtime for file
-    struct stat stat_val;
-    stat(tmp_file, &stat_val);
-
-    _pool.reset(new ObjectPool);
-    std::unique_ptr<DiskIoMgr> io_mgr(new DiskIoMgr(1, 1, read_len, read_len));
-
-    Status status = io_mgr->init(LARGE_MEM_LIMIT);
-    EXPECT_TRUE(status.ok());
-    DiskIoMgr::RequestContext* reader;
-    status = io_mgr->register_context(&reader);
-    EXPECT_TRUE(status.ok());
-
-    // We should not read past the end of file.
-    DiskIoMgr::ScanRange* range = init_range(1, tmp_file, 0, read_len, 0, stat_val.st_mtime);
-    DiskIoMgr::BufferDescriptor* buffer;
-    status = io_mgr->read(reader, range, &buffer);
-    EXPECT_TRUE(status.ok());
-    EXPECT_TRUE(buffer->eosr());
-    EXPECT_EQ(len, buffer->len());
-    EXPECT_TRUE(memcmp(buffer->buffer(), data, len) == 0);
-    buffer->return_buffer();
-
-    io_mgr->unregister_context(reader);
-    _pool.reset();
-    io_mgr.reset();
-    EXPECT_EQ(io_mgr->mem_tracker()->consumption(), 0);
-}
-
-} // end namespace doris
diff --git a/be/test/runtime/test_env.cc b/be/test/runtime/test_env.cc
index 2c40d4432f..f33b6b5238 100644
--- a/be/test/runtime/test_env.cc
+++ b/be/test/runtime/test_env.cc
@@ -22,7 +22,6 @@
 #include <memory>
 
 #include "olap/storage_engine.h"
-#include "runtime/bufferpool/buffer_pool.h"
 #include "runtime/fragment_mgr.h"
 #include "runtime/result_queue_mgr.h"
 #include "util/disk_info.h"
@@ -34,8 +33,6 @@ TestEnv::TestEnv() {
     // Some code will use ExecEnv::GetInstance(), so init the global ExecEnv singleton
     _exec_env = ExecEnv::GetInstance();
     _exec_env->_thread_mgr = new ThreadResourceMgr(2);
-    _exec_env->_disk_io_mgr = new DiskIoMgr(1, 1, 1, 10);
-    _exec_env->disk_io_mgr()->init(-1);
     _exec_env->_result_queue_mgr = new ResultQueueMgr();
     // TODO may need rpc support, etc.
 }
@@ -50,14 +47,8 @@ void TestEnv::init_tmp_file_mgr(const std::vector<std::string>& tmp_dirs, bool o
     DCHECK(st.ok()) << st;
 }
 
-void TestEnv::init_buffer_pool(int64_t min_page_len, int64_t capacity, int64_t clean_pages_limit) {
-    _exec_env->_buffer_pool = new BufferPool(min_page_len, capacity, clean_pages_limit);
-}
-
 TestEnv::~TestEnv() {
     SAFE_DELETE(_exec_env->_result_queue_mgr);
-    SAFE_DELETE(_exec_env->_buffer_pool);
-    SAFE_DELETE(_exec_env->_disk_io_mgr);
     SAFE_DELETE(_exec_env->_thread_mgr);
 
     if (_engine == StorageEngine::_s_instance) {
diff --git a/be/test/runtime/test_env.h b/be/test/runtime/test_env.h
index ea034ebd19..a6baae9d27 100644
--- a/be/test/runtime/test_env.h
+++ b/be/test/runtime/test_env.h
@@ -18,7 +18,6 @@
 #ifndef DORIS_BE_TEST_QUERY_RUNTIME_TEST_ENV_H
 #define DORIS_BE_TEST_QUERY_RUNTIME_TEST_ENV_H
 
-#include "runtime/disk_io_mgr.h"
 #include "runtime/exec_env.h"
 #include "runtime/runtime_state.h"
 #include "runtime/tmp_file_mgr.h"
@@ -37,8 +36,6 @@ public:
     // query states have been created.
     void init_tmp_file_mgr(const std::vector<std::string>& tmp_dirs, bool one_dir_per_device);
 
-    void init_buffer_pool(int64_t min_page_len, int64_t capacity, int64_t clean_pages_limit);
-
     // If don't need to open, paths can be empty.
     void init_storage_engine(bool need_open, const std::vector<std::string>& paths = {});
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org