You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by zo...@apache.org on 2023/06/28 08:49:53 UTC

[doris] branch master updated: [fix](memory) Purge Jemalloc arena dirty pages when memory insufficient (#21237)

This is an automated email from the ASF dual-hosted git repository.

zouxinyi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new d2c42ec638 [fix](memory) Purge Jemalloc arena dirty pages when memory insufficient (#21237)
d2c42ec638 is described below

commit d2c42ec638f788d6eb23fc02ba45584ca198c8f0
Author: Xinyi Zou <zo...@gmail.com>
AuthorDate: Wed Jun 28 16:49:45 2023 +0800

    [fix](memory) Purge Jemalloc arena dirty pages when memory insufficient (#21237)
    
    Jemalloc dirty pages only use madvise MADV_FREE, so memory is not released back to the system and RSS is not reduced in time.
    
    So when the process memory exceeds the limit or the system's available memory is insufficient,
    manually transfer dirty pages to muzzy pages, which will call MADV_DONTNEED to release the physical memory back to the system.
    
    https://jemalloc.net/jemalloc.3.html#opt.dirty_decay_ms
---
 be/src/common/daemon.cpp                      |  4 +--
 be/src/runtime/memory/mem_tracker_limiter.cpp |  2 +-
 be/src/util/mem_info.cpp                      | 17 ++++++++----
 be/src/util/mem_info.h                        | 37 +++++++++++++++++++++++++++
 be/src/util/system_metrics.cpp                | 30 ++++++++++++++++++++++
 5 files changed, 82 insertions(+), 8 deletions(-)

diff --git a/be/src/common/daemon.cpp b/be/src/common/daemon.cpp
index 7c64596bac..cfb002a4c4 100644
--- a/be/src/common/daemon.cpp
+++ b/be/src/common/daemon.cpp
@@ -245,7 +245,7 @@ void Daemon::memory_gc_thread() {
             // No longer full gc and minor gc during sleep.
             memory_full_gc_sleep_time_ms = config::memory_gc_sleep_time_ms;
             memory_minor_gc_sleep_time_ms = config::memory_gc_sleep_time_ms;
-            doris::MemTrackerLimiter::print_log_process_usage("process full gc", false);
+            doris::MemTrackerLimiter::print_log_process_usage("Start Full GC", false);
             if (doris::MemInfo::process_full_gc()) {
                 // If there is not enough memory to be gc, the process memory usage will not be printed in the next continuous gc.
                 doris::MemTrackerLimiter::enable_print_log_process_usage();
@@ -255,7 +255,7 @@ void Daemon::memory_gc_thread() {
                     proc_mem_no_allocator_cache >= doris::MemInfo::soft_mem_limit())) {
             // No minor gc during sleep, but full gc is possible.
             memory_minor_gc_sleep_time_ms = config::memory_gc_sleep_time_ms;
-            doris::MemTrackerLimiter::print_log_process_usage("process minor gc", false);
+            doris::MemTrackerLimiter::print_log_process_usage("Start Minor GC", false);
             if (doris::MemInfo::process_minor_gc()) {
                 doris::MemTrackerLimiter::enable_print_log_process_usage();
             }
diff --git a/be/src/runtime/memory/mem_tracker_limiter.cpp b/be/src/runtime/memory/mem_tracker_limiter.cpp
index d03bd1ac00..683971ecac 100644
--- a/be/src/runtime/memory/mem_tracker_limiter.cpp
+++ b/be/src/runtime/memory/mem_tracker_limiter.cpp
@@ -137,7 +137,7 @@ void MemTrackerLimiter::make_process_snapshots(std::vector<MemTracker::Snapshot>
         process_mem_sum += it.second->current_value();
     }
 
-    snapshot.type = "tc/jemalloc_cache";
+    snapshot.type = "tc/jemalloc_free_memory";
     snapshot.label = "";
     snapshot.limit = -1;
     snapshot.cur_consumption = MemInfo::allocator_cache_mem();
diff --git a/be/src/util/mem_info.cpp b/be/src/util/mem_info.cpp
index f50f41e198..200d346ded 100644
--- a/be/src/util/mem_info.cpp
+++ b/be/src/util/mem_info.cpp
@@ -77,14 +77,18 @@ int64_t MemInfo::_s_process_full_gc_size = -1;
 void MemInfo::refresh_allocator_mem() {
 #if defined(ADDRESS_SANITIZER) || defined(LEAK_SANITIZER) || defined(THREAD_SANITIZER)
 #elif defined(USE_JEMALLOC)
+    // 'epoch' is a special mallctl -- it updates the statistics. Without it, all
+    // the following calls will return stale values. It increments and returns
+    // the current epoch number, which might be useful to log as a sanity check.
     uint64_t epoch = 0;
     size_t sz = sizeof(epoch);
     jemallctl("epoch", &epoch, &sz, &epoch, sz);
 
     // https://jemalloc.net/jemalloc.3.html
-    _s_allocator_cache_mem =
-            get_je_metrics(fmt::format("stats.arenas.{}.tcache_bytes", MALLCTL_ARENAS_ALL)) +
-            get_je_metrics("stats.metadata");
+    // https://www.bookstack.cn/read/aliyun-rds-core/4a0cdf677f62feb3.md
+    _s_allocator_cache_mem = get_je_all_arena_metrics("tcache_bytes") +
+                             get_je_metrics("stats.metadata") +
+                             get_je_all_arena_metrics("pdirty") * get_page_size();
     _s_allocator_cache_mem_str =
             PrettyPrinter::print(static_cast<uint64_t>(_s_allocator_cache_mem), TUnit::BYTES);
     _s_virtual_memory_used = get_je_metrics("stats.mapped");
@@ -125,6 +129,7 @@ void MemInfo::process_cache_gc(int64_t& freed_mem) {
                 segment_v2::PRIMARY_KEY_INDEX_PAGE);
         StoragePageCache::instance()->prune(segment_v2::PRIMARY_KEY_INDEX_PAGE);
     }
+    je_purge_all_arena_dirty_pages();
 }
 
 // step1: free all cache
@@ -139,7 +144,8 @@ bool MemInfo::process_minor_gc() {
     std::string mem_available_str = MemInfo::sys_mem_available_str();
 
     Defer defer {[&]() {
-        LOG(INFO) << fmt::format("Process Minor GC Free Memory {} Bytes. cost(us): {}", freed_mem,
+        je_purge_all_arena_dirty_pages();
+        LOG(INFO) << fmt::format("End Minor GC, Free Memory {} Bytes. cost(us): {}", freed_mem,
                                  watch.elapsed_time() / 1000);
     }};
 
@@ -181,7 +187,8 @@ bool MemInfo::process_full_gc() {
     std::string mem_available_str = MemInfo::sys_mem_available_str();
 
     Defer defer {[&]() {
-        LOG(INFO) << fmt::format("Process Full GC Free Memory {} Bytes. cost(us): {}", freed_mem,
+        je_purge_all_arena_dirty_pages();
+        LOG(INFO) << fmt::format("End Full GC Free, Memory {} Bytes. cost(us): {}", freed_mem,
                                  watch.elapsed_time() / 1000);
     }};
 
diff --git a/be/src/util/mem_info.h b/be/src/util/mem_info.h
index 12c70d8cc4..89a66b0658 100644
--- a/be/src/util/mem_info.h
+++ b/be/src/util/mem_info.h
@@ -26,6 +26,12 @@
 #include <atomic>
 #include <string>
 
+#if !defined(__APPLE__) || !defined(_POSIX_C_SOURCE)
+#include <unistd.h>
+#else
+#include <mach/vm_page_size.h>
+#endif
+
 #include "common/logging.h"
 #ifdef USE_JEMALLOC
 #include "jemalloc/jemalloc.h"
@@ -46,6 +52,14 @@ public:
 
     static inline bool initialized() { return _s_initialized; }
 
+    static int get_page_size() {
+#if !defined(__APPLE__) || !defined(_POSIX_C_SOURCE)
+        return getpagesize();
+#else
+        return vm_page_size;
+#endif
+    }
+
     // Get total physical memory in bytes (if has cgroups memory limits, return the limits).
     static inline int64_t physical_mem() {
         DCHECK(_s_initialized);
@@ -83,6 +97,22 @@ public:
 #endif
         return 0;
     }
+
+    static inline int64_t get_je_all_arena_metrics(const std::string& name) {
+#ifdef USE_JEMALLOC
+        return get_je_metrics(fmt::format("stats.arenas.{}.{}", MALLCTL_ARENAS_ALL, name));
+#endif
+        return 0;
+    }
+
+    static inline void je_purge_all_arena_dirty_pages() {
+#ifdef USE_JEMALLOC
+        // Purge all unused dirty pages for arena <i>, or for all arenas if <i> equals MALLCTL_ARENAS_ALL.
+        jemallctl(fmt::format("arena.{}.purge", MALLCTL_ARENAS_ALL).c_str(), nullptr, nullptr,
+                  nullptr, 0);
+#endif
+    }
+
     static inline size_t allocator_virtual_mem() { return _s_virtual_memory_used; }
     static inline size_t allocator_cache_mem() { return _s_allocator_cache_mem; }
     static inline std::string allocator_cache_mem_str() { return _s_allocator_cache_mem_str; }
@@ -94,6 +124,13 @@ public:
     // obtained by the process malloc, not the physical memory actually used by the process in the OS.
     static void refresh_allocator_mem();
 
+    /** jemalloc pdirty is number of pages within unused extents that are potentially
+      * dirty, and for which madvise() or similar has not been called.
+      *
+      * So they will be subtracted from RSS to make accounting more
+      * accurate, since those pages are not really RSS but memory
+      * that can be reused at any time via jemalloc.
+      */
     static inline void refresh_proc_mem_no_allocator_cache() {
         _s_proc_mem_no_allocator_cache =
                 PerfCounters::get_vm_rss() - static_cast<int64_t>(_s_allocator_cache_mem);
diff --git a/be/src/util/system_metrics.cpp b/be/src/util/system_metrics.cpp
index fa8f5a181a..ee7db9494c 100644
--- a/be/src/util/system_metrics.cpp
+++ b/be/src/util/system_metrics.cpp
@@ -117,6 +117,12 @@ DEFINE_MEMORY_GAUGE_METRIC(jemalloc_metadata_bytes, MetricUnit::BYTES);
 DEFINE_MEMORY_GAUGE_METRIC(jemalloc_resident_bytes, MetricUnit::BYTES);
 DEFINE_MEMORY_GAUGE_METRIC(jemalloc_mapped_bytes, MetricUnit::BYTES);
 DEFINE_MEMORY_GAUGE_METRIC(jemalloc_retained_bytes, MetricUnit::BYTES);
+DEFINE_MEMORY_GAUGE_METRIC(jemalloc_tcache_bytes, MetricUnit::BYTES);
+DEFINE_MEMORY_GAUGE_METRIC(jemalloc_pactive_num, MetricUnit::NOUNIT);
+DEFINE_MEMORY_GAUGE_METRIC(jemalloc_pdirty_num, MetricUnit::NOUNIT);
+DEFINE_MEMORY_GAUGE_METRIC(jemalloc_pmuzzy_num, MetricUnit::NOUNIT);
+DEFINE_MEMORY_GAUGE_METRIC(jemalloc_dirty_purged_num, MetricUnit::NOUNIT);
+DEFINE_MEMORY_GAUGE_METRIC(jemalloc_muzzy_purged_num, MetricUnit::NOUNIT);
 #endif
 
 struct MemoryMetrics {
@@ -142,6 +148,12 @@ struct MemoryMetrics {
         INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_resident_bytes);
         INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_mapped_bytes);
         INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_retained_bytes);
+        INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_tcache_bytes);
+        INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_pactive_num);
+        INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_pdirty_num);
+        INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_pmuzzy_num);
+        INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_dirty_purged_num);
+        INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_muzzy_purged_num);
 #endif
     }
 
@@ -167,6 +179,12 @@ struct MemoryMetrics {
     IntGauge* memory_jemalloc_resident_bytes;
     IntGauge* memory_jemalloc_mapped_bytes;
     IntGauge* memory_jemalloc_retained_bytes;
+    IntGauge* memory_jemalloc_tcache_bytes;
+    IntGauge* memory_jemalloc_pactive_num;
+    IntGauge* memory_jemalloc_pdirty_num;
+    IntGauge* memory_jemalloc_pmuzzy_num;
+    IntGauge* memory_jemalloc_dirty_purged_num;
+    IntGauge* memory_jemalloc_muzzy_purged_num;
 #endif
 };
 
@@ -457,6 +475,18 @@ void SystemMetrics::update_allocator_metrics() {
             MemInfo::get_je_metrics("stats.mapped"));
     _memory_metrics->memory_jemalloc_retained_bytes->set_value(
             MemInfo::get_je_metrics("stats.retained"));
+    _memory_metrics->memory_jemalloc_tcache_bytes->set_value(
+            MemInfo::get_je_all_arena_metrics("tcache_bytes"));
+    _memory_metrics->memory_jemalloc_pactive_num->set_value(
+            MemInfo::get_je_all_arena_metrics("pactive"));
+    _memory_metrics->memory_jemalloc_pdirty_num->set_value(
+            MemInfo::get_je_all_arena_metrics("pdirty"));
+    _memory_metrics->memory_jemalloc_pmuzzy_num->set_value(
+            MemInfo::get_je_all_arena_metrics("pmuzzy"));
+    _memory_metrics->memory_jemalloc_dirty_purged_num->set_value(
+            MemInfo::get_je_all_arena_metrics("dirty_purged"));
+    _memory_metrics->memory_jemalloc_muzzy_purged_num->set_value(
+            MemInfo::get_je_all_arena_metrics("muzzy_purged"));
 #else
     _memory_metrics->memory_tcmalloc_allocated_bytes->set_value(
             MemInfo::get_tc_metrics("generic.total_physical_bytes"));


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org