You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pegasus.apache.org by wa...@apache.org on 2023/02/07 07:45:26 UTC

[incubator-pegasus] branch master updated: feat(new_metrics): retire stale metric entities that are not used by any other object (#1304)

This is an automated email from the ASF dual-hosted git repository.

wangdan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pegasus.git


The following commit(s) were added to refs/heads/master by this push:
     new 624cae5ac feat(new_metrics): retire stale metric entities that are not used by any other object (#1304)
624cae5ac is described below

commit 624cae5ac922dee96c20629d1bd35b0cc3b62e2f
Author: Dan Wang <wa...@apache.org>
AuthorDate: Tue Feb 7 15:45:20 2023 +0800

    feat(new_metrics): retire stale metric entities that are not used by any other object (#1304)
---
 src/utils/metrics.cpp           | 191 +++++++++++++++++++++++++++++++----
 src/utils/metrics.h             | 193 ++++++++++++++++++++++++++----------
 src/utils/test/metrics_test.cpp | 215 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 529 insertions(+), 70 deletions(-)

diff --git a/src/utils/metrics.cpp b/src/utils/metrics.cpp
index 6e83324d6..007fb2198 100644
--- a/src/utils/metrics.cpp
+++ b/src/utils/metrics.cpp
@@ -17,7 +17,9 @@
 
 #include "utils/metrics.h"
 
+#include "runtime/api_layer1.h"
 #include "utils/api_utilities.h"
+#include "utils/flags.h"
 #include "utils/rand.h"
 #include "utils/shared_io_service.h"
 #include "utils/string_conv.h"
@@ -25,10 +27,15 @@
 
 namespace dsn {
 
+DSN_DEFINE_uint64(metrics,
+                  entity_retirement_delay_ms,
+                  10 * 60 * 1000,
+                  "The retention interval (milliseconds) for an entity after it becomes stale.");
+
 metric_entity::metric_entity(const metric_entity_prototype *prototype,
                              const std::string &id,
                              const attr_map &attrs)
-    : _prototype(prototype), _id(id), _attrs(attrs)
+    : _prototype(prototype), _id(id), _attrs(attrs), _retire_time_ms(0)
 {
 }
 
@@ -45,13 +52,10 @@ void metric_entity::close(close_option option)
 {
     utils::auto_write_lock l(_lock);
 
-    // The reason why each metric is closed in the entity rather than in the destructor of each
-    // metric is that close() for the metric will return immediately without waiting for any close
-    // operation to be finished.
-    //
-    // Thus, to close all metrics owned by an entity, it's more efficient to firstly issue a close
-    // request for all metrics; then, just wait for all of the close operations to be finished.
-    // It's inefficient to wait for each metric to be closed one by one.
+    // To close all metrics owned by an entity, it's more efficient to firstly issue an asynchronous
+    // close request to each metric; then, just wait for all of the close operations to be finished.
+    // It's inefficient to wait for each metric to be closed one by one. Therefore, the metric is
+    // not closed in its destructor.
     for (auto &m : _metrics) {
         if (m.second->prototype()->type() == metric_type::kPercentile) {
             auto p = down_cast<closeable_metric *>(m.second.get());
@@ -177,6 +181,17 @@ void metric_entity::take_snapshot(metric_json_writer &writer, const metric_filte
     writer.EndObject();
 }
 
+bool metric_entity::is_stale() const
+{
+    // Since this entity itself is still being accessed, its reference count should be 1
+    // at least.
+    CHECK_GE(get_count(), 1);
+
+    // This entity is considered stale once there is only one reference for it kept in the
+    // registry.
+    return get_count() == 1;
+}
+
 void metric_filters::extract_entity_metrics(const metric_entity::metric_map &candidates,
                                             metric_entity::metric_map &target_metrics) const
 {
@@ -312,10 +327,12 @@ metric_registry::metric_registry() : _http_service(this)
 {
     // We should ensure that metric_registry is destructed before shared_io_service is destructed.
     // Once shared_io_service is destructed before metric_registry is destructed,
-    // boost::asio::io_service needed by metrics in metric_registry such as percentile_timer will
+    // boost::asio::io_service needed by metrics in metric_registry such as metric_timer will
     // be released firstly, then will lead to heap-use-after-free error since percentiles in
     // metric_registry are still running but the resources they needed have been released.
     tools::shared_io_service::instance();
+
+    start_timer();
 }
 
 metric_registry::~metric_registry()
@@ -336,6 +353,39 @@ metric_registry::~metric_registry()
     for (auto &entity : _entities) {
         entity.second->close(metric_entity::close_option::kNoWait);
     }
+
+    stop_timer();
+}
+
+void metric_registry::on_close() {}
+
+void metric_registry::start_timer()
+{
+    if (_timer) {
+        return;
+    }
+
+    // Once an entity is considered stale, it will be retired after the retention interval,
+    // namely FLAGS_entity_retirement_delay_ms milliseconds. Therefore, if the interval of
+    // the timer is also set to FLAGS_entity_retirement_delay_ms, in the next round, it's
+    // just about time to retire this entity.
+    _timer.reset(new metric_timer(FLAGS_entity_retirement_delay_ms,
+                                  std::bind(&metric_registry::process_stale_entities, this),
+                                  std::bind(&metric_registry::on_close, this)));
+}
+
+void metric_registry::stop_timer()
+{
+    if (!_timer) {
+        return;
+    }
+
+    // Close the timer synchronously.
+    _timer->close();
+    _timer->wait();
+
+    // Reset the timer to mark that it has been stopped, now it could be started.
+    _timer.reset();
 }
 
 metric_registry::entity_map metric_registry::entities() const
@@ -383,6 +433,111 @@ metric_entity_ptr metric_registry::find_or_create_entity(const metric_entity_pro
     return entity;
 }
 
+metric_registry::collected_entities_info metric_registry::collect_stale_entities() const
+{
+    collected_entities_info collected_info;
+
+    auto now = dsn_now_ms();
+
+    utils::auto_read_lock l(_lock);
+
+    for (const auto &entity : _entities) {
+        if (!entity.second->is_stale()) {
+            if (entity.second->_retire_time_ms > 0) {
+                // This entity had been scheduled to be retired. However, it was reemployed
+                // after that. It has been in use since then, therefore its scheduled time
+                // for retirement should be reset to 0.
+                collected_info.collected_entities.insert(entity.first);
+            }
+            continue;
+        }
+
+        if (entity.second->_retire_time_ms > now) {
+            // This entity has been scheduled to be retired, however it is still within
+            // the retention interval. Thus do not collect it.
+            ++collected_info.num_scheduled_entities;
+            continue;
+        }
+
+        collected_info.collected_entities.insert(entity.first);
+    }
+
+    collected_info.num_all_entities = _entities.size();
+    return collected_info;
+}
+
+metric_registry::retired_entities_stat
+metric_registry::retire_stale_entities(const collected_entity_list &collected_entities)
+{
+    if (collected_entities.empty()) {
+        // Do not lock for empty list.
+        return retired_entities_stat();
+    }
+
+    retired_entities_stat retired_stat;
+
+    auto now = dsn_now_ms();
+
+    utils::auto_write_lock l(_lock);
+
+    for (const auto &collected_entity : collected_entities) {
+        auto iter = _entities.find(collected_entity);
+        if (dsn_unlikely(iter == _entities.end())) {
+            // The entity has been removed from the registry for some unusual reason.
+            continue;
+        }
+
+        if (!iter->second->is_stale()) {
+            if (iter->second->_retire_time_ms > 0) {
+                // For those entities which are reemployed, their scheduled time for retirement
+                // should be reset to 0 though previously they could have been scheduled to be
+                // retired.
+                iter->second->_retire_time_ms = 0;
+                ++retired_stat.num_reemployed_entities;
+            }
+            continue;
+        }
+
+        if (dsn_unlikely(iter->second->_retire_time_ms > now)) {
+            // Since collect_stale_entities() has already filtered out the entities that are
+            // still within the retention interval, this is unlikely to happen. However, we
+            // still check here.
+            continue;
+        }
+
+        if (iter->second->_retire_time_ms == 0) {
+            // The entity should be marked with a scheduled time for retirement, since it has
+            // already been considered stale.
+            iter->second->_retire_time_ms = now + FLAGS_entity_retirement_delay_ms;
+            ++retired_stat.num_recently_scheduled_entities;
+            continue;
+        }
+
+        // Once the entity is outside the retention interval, retire it from the registry.
+        _entities.erase(iter);
+        ++retired_stat.num_retired_entities;
+    }
+
+    return retired_stat;
+}
+
+void metric_registry::process_stale_entities()
+{
+    LOG_INFO("begin to process stale metric entities");
+
+    const auto &collected_info = collect_stale_entities();
+    const auto &retired_stat = retire_stale_entities(collected_info.collected_entities);
+
+    LOG_INFO("stat for metric entities: total={}, collected={}, retired={}, scheduled={}, "
+             "recently_scheduled={}, reemployed={}",
+             collected_info.num_all_entities,
+             collected_info.collected_entities.size(),
+             retired_stat.num_retired_entities,
+             collected_info.num_scheduled_entities,
+             retired_stat.num_recently_scheduled_entities,
+             retired_stat.num_reemployed_entities);
+}
+
 metric_prototype::metric_prototype(const ctor_args &args) : _args(args) {}
 
 metric_prototype::~metric_prototype() {}
@@ -391,7 +546,7 @@ metric::metric(const metric_prototype *prototype) : _prototype(prototype) {}
 
 closeable_metric::closeable_metric(const metric_prototype *prototype) : metric(prototype) {}
 
-uint64_t percentile_timer::generate_initial_delay_ms(uint64_t interval_ms)
+uint64_t metric_timer::generate_initial_delay_ms(uint64_t interval_ms)
 {
     CHECK_GT(interval_ms, 0);
 
@@ -403,7 +558,7 @@ uint64_t percentile_timer::generate_initial_delay_ms(uint64_t interval_ms)
     return (rand::next_u64() % interval_seconds + 1) * 1000 + rand::next_u64() % 1000;
 }
 
-percentile_timer::percentile_timer(uint64_t interval_ms, on_exec_fn on_exec, on_close_fn on_close)
+metric_timer::metric_timer(uint64_t interval_ms, on_exec_fn on_exec, on_close_fn on_close)
     : _initial_delay_ms(generate_initial_delay_ms(interval_ms)),
       _interval_ms(interval_ms),
       _on_exec(on_exec),
@@ -413,10 +568,10 @@ percentile_timer::percentile_timer(uint64_t interval_ms, on_exec_fn on_exec, on_
       _timer(new boost::asio::deadline_timer(tools::shared_io_service::instance().ios))
 {
     _timer->expires_from_now(boost::posix_time::milliseconds(_initial_delay_ms));
-    _timer->async_wait(std::bind(&percentile_timer::on_timer, this, std::placeholders::_1));
+    _timer->async_wait(std::bind(&metric_timer::on_timer, this, std::placeholders::_1));
 }
 
-void percentile_timer::close()
+void metric_timer::close()
 {
     // If the timer has already expired when cancel() is called, then the handlers for asynchronous
     // wait operations will:
@@ -433,15 +588,15 @@ void percentile_timer::close()
     }
 }
 
-void percentile_timer::wait() { _completed.wait(); }
+void metric_timer::wait() { _completed.wait(); }
 
-void percentile_timer::on_close()
+void metric_timer::on_close()
 {
     _on_close();
     _completed.notify();
 }
 
-void percentile_timer::on_timer(const boost::system::error_code &ec)
+void metric_timer::on_timer(const boost::system::error_code &ec)
 {
 // This macro is defined for the case that handlers for asynchronous wait operations are no
 // longer cancelled. It just checks the internal state atomically (since close() can also be
@@ -465,7 +620,7 @@ void percentile_timer::on_timer(const boost::system::error_code &ec)
         // Cancel can only be launched by close().
         auto expected_state = state::kClosing;
         CHECK(_state.compare_exchange_strong(expected_state, state::kClosed),
-              "wrong state for percentile_timer: {}, while expecting closing state",
+              "wrong state for metric_timer: {}, while expecting closing state",
               static_cast<int>(expected_state));
         on_close();
 
@@ -477,7 +632,7 @@ void percentile_timer::on_timer(const boost::system::error_code &ec)
 
     TRY_PROCESS_TIMER_CLOSING();
     _timer->expires_from_now(boost::posix_time::milliseconds(_interval_ms));
-    _timer->async_wait(std::bind(&percentile_timer::on_timer, this, std::placeholders::_1));
+    _timer->async_wait(std::bind(&metric_timer::on_timer, this, std::placeholders::_1));
 #undef TRY_PROCESS_TIMER_CLOSING
 }
 
diff --git a/src/utils/metrics.h b/src/utils/metrics.h
index c7c1bda18..340d2fd9d 100644
--- a/src/utils/metrics.h
+++ b/src/utils/metrics.h
@@ -171,6 +171,7 @@ public:
 private:
     friend class metric_registry;
     friend class ref_ptr<metric_entity>;
+    friend class scoped_entity;
 
     metric_entity(const metric_entity_prototype *prototype,
                   const std::string &id,
@@ -197,6 +198,18 @@ private:
 
     void encode_id(metric_json_writer &writer) const;
 
+    // Decide if an entity is stale. An entity becomes stale if it is no longer used by any other
+    // object.
+    //
+    // An entity could be bound to one or multiple objects. Once all of these objects are
+    // destroyed, this entity will become stale, which means all of the metrics held by this
+    // entity are also stale.
+    //
+    // For example, once a replica is removed, the replica entity (and all metrics it holds) will
+    // become stale; then, this entity is scheduled to be retired after a configurable retention
+    // interval; finally, this entity will be removed from the registry with all metrics it holds.
+    bool is_stale() const;
+
     const metric_entity_prototype *const _prototype;
     const std::string _id;
 
@@ -204,6 +217,12 @@ private:
     attr_map _attrs;
     metric_map _metrics;
 
+    // The timestamp when this entity should be retired:
+    // * default value is 0, which means this entity has not been scheduled to be retired;
+    // * otherwise, non-zero value means this entity has been scheduled to be retired, and will
+    // be retired at any time once current time has reached or exceeded this timestamp.
+    uint64_t _retire_time_ms;
+
     DISALLOW_COPY_AND_ASSIGN(metric_entity);
 };
 
@@ -354,10 +373,94 @@ private:
     DISALLOW_COPY_AND_ASSIGN(metrics_http_service);
 };
 
+// `metric_timer` is a timer class that runs metric-related computations periodically, such as
+// calculating percentile, checking if there are stale entities. It accepts `on_exec` and
+// `on_close` as the callbacks for execution and close.
+//
+// To avoid all metrics (such as percentiles) being computed at the same time and leading to very
+// high load, the first computation will be delayed by a random interval.
+class metric_timer
+{
+public:
+    enum class state : int
+    {
+        kRunning,
+        kClosing,
+        kClosed,
+    };
+
+    using on_exec_fn = std::function<void()>;
+    using on_close_fn = std::function<void()>;
+
+    metric_timer(uint64_t interval_ms, on_exec_fn on_exec, on_close_fn on_close);
+    ~metric_timer() = default;
+
+    void close();
+    void wait();
+
+    // Get the initial delay that is randomly generated by `generate_initial_delay_ms()`.
+    uint64_t get_initial_delay_ms() const { return _initial_delay_ms; }
+
+private:
+    // Generate an initial delay randomly in case that all percentiles are computed at the
+    // same time.
+    static uint64_t generate_initial_delay_ms(uint64_t interval_ms);
+
+    void on_close();
+
+    void on_timer(const boost::system::error_code &ec);
+
+    const uint64_t _initial_delay_ms;
+    const uint64_t _interval_ms;
+    const on_exec_fn _on_exec;
+    const on_close_fn _on_close;
+    std::atomic<state> _state;
+    utils::notify_event _completed;
+    std::unique_ptr<boost::asio::deadline_timer> _timer;
+
+    DISALLOW_COPY_AND_ASSIGN(metric_timer);
+};
+
 class metric_registry : public utils::singleton<metric_registry>
 {
 public:
     using entity_map = std::unordered_map<std::string, metric_entity_ptr>;
+    using collected_entity_list = std::unordered_set<std::string>;
+
+    struct collected_entities_info
+    {
+        // The collected entities that will be processed by retire_stale_entities(). Following
+        // kinds of entities will be collected:
+        // * entities that should be retired immediately. The entities that are still within
+        // the retention interval will not be collected.
+        // * entities that were previously considered stale however have already been reemployed,
+        // which means its retirement should be cancelled by retire_stale_entities().
+        collected_entity_list collected_entities;
+
+        // The number of all entities in the registry.
+        size_t num_all_entities = 0;
+
+        // The number of the entities that have been scheduled to be retired.
+        size_t num_scheduled_entities = 0;
+
+        collected_entities_info() = default;
+    };
+
+    struct retired_entities_stat
+    {
+        // The number of retired entities.
+        size_t num_retired_entities = 0;
+
+        // The number of entities that were recently considered stale and scheduled to be
+        // retired.
+        size_t num_recently_scheduled_entities = 0;
+
+        // The number of the entities that had previously been scheduled to be retired and
+        // were recently reemployed.
+        size_t num_reemployed_entities = 0;
+
+        retired_entities_stat() = default;
+    };
 
     entity_map entities() const;
 
@@ -368,19 +471,51 @@ private:
     friend class utils::singleton<metric_registry>;
 
     friend void test_get_metrics_handler(const http_request &req, http_response &resp);
+    friend class scoped_entity;
+    friend class MetricsRetirementTest;
 
     metric_registry();
     ~metric_registry();
 
+    void on_close();
+
+    void start_timer();
+    void stop_timer();
+
     metric_entity_ptr find_or_create_entity(const metric_entity_prototype *prototype,
                                             const std::string &id,
                                             const metric_entity::attr_map &attrs);
 
+    // These functions are used to retire stale entities.
+    //
+    // Since retirement is infrequent, there tend to be no entity that should be retired.
+    // Therefore, the whole retirement process is divided into two phases: "collect" and
+    // "retire".
+    //
+    // At the first phase "collect", we just check if there are entities that:
+    // * have become stale, but have not been scheduled to be retired, or
+    // * should be retired immediately, or
+    // * were previously scheduled to be retired, but have now been reemployed.
+    //
+    // All operations in the first phase are read-only, needing just a read lock, which is more
+    // lightweight. If some entities are found meeting the above conditions, albeit infrequently,
+    // they will be collected to be processed at the next phase.
+    //
+    // Collected entities, if any, will be processed at the second phase "retire":
+    // * stale entities will be scheduled to be retired;
+    // * the expired entities will be retired;
+    // * reset the retirement timestamp to 0 for reemployed entities.
+    collected_entities_info collect_stale_entities() const;
+    retired_entities_stat retire_stale_entities(const collected_entity_list &collected_entities);
+    void process_stale_entities();
+
     mutable utils::rw_lock_nr _lock;
     entity_map _entities;
 
     metrics_http_service _http_service;
 
+    std::unique_ptr<metric_timer> _timer;
+
     DISALLOW_COPY_AND_ASSIGN(metric_registry);
 };
 
@@ -593,6 +728,8 @@ protected:
     const metric_prototype *const _prototype;
 
 private:
+    friend class metric_entity;
+
     DISALLOW_COPY_AND_ASSIGN(metric);
 };
 
@@ -892,56 +1029,6 @@ inline size_t kth_percentile_to_nth_index(size_t size, kth_percentile_type type)
     return kth_percentile_to_nth_index(size, static_cast<size_t>(type));
 }
 
-// `percentile_timer` is a timer class that encapsulates the details how each percentile is
-// computed periodically.
-//
-// To be instantiated, it requires `interval_ms` at which a percentile is computed and `exec`
-// which is used to compute percentile.
-//
-// In case that all percentiles are computed at the same time and lead to very high load,
-// first computation for percentile will be delayed at a random interval.
-class percentile_timer
-{
-public:
-    enum class state : int
-    {
-        kRunning,
-        kClosing,
-        kClosed,
-    };
-
-    using on_exec_fn = std::function<void()>;
-    using on_close_fn = std::function<void()>;
-
-    percentile_timer(uint64_t interval_ms, on_exec_fn on_exec, on_close_fn on_close);
-    ~percentile_timer() = default;
-
-    void close();
-    void wait();
-
-    // Get the initial delay that is randomly generated by `generate_initial_delay_ms()`.
-    uint64_t get_initial_delay_ms() const { return _initial_delay_ms; }
-
-private:
-    // Generate an initial delay randomly in case that all percentiles are computed at the
-    // same time.
-    static uint64_t generate_initial_delay_ms(uint64_t interval_ms);
-
-    void on_close();
-
-    void on_timer(const boost::system::error_code &ec);
-
-    const uint64_t _initial_delay_ms;
-    const uint64_t _interval_ms;
-    const on_exec_fn _on_exec;
-    const on_close_fn _on_close;
-    std::atomic<state> _state;
-    utils::notify_event _completed;
-    std::unique_ptr<boost::asio::deadline_timer> _timer;
-
-    DISALLOW_COPY_AND_ASSIGN(percentile_timer);
-};
-
 // The percentile is a metric type that samples observations. The size of samples has an upper
 // bound. Once the maximum size is reached, the earliest observations will be overwritten.
 //
@@ -1022,6 +1109,8 @@ protected:
     // interval_ms is the interval between the computations for percentiles. Its unit is
     // milliseconds. It's suggested that interval_ms should be near the period between pulls
     // from or pushes to the monitoring system.
+    // TODO(wangdan): we can also support constructing percentiles from the parameters in
+    // the configuration file.
     percentile(const metric_prototype *prototype,
                uint64_t interval_ms = 10000,
                const std::set<kth_percentile_type> &kth_percentiles = kAllKthPercentileTypes,
@@ -1066,7 +1155,7 @@ protected:
         // See on_close() for details which is registered in timer and will be called
         // back once close() is invoked.
         add_ref();
-        _timer.reset(new percentile_timer(
+        _timer.reset(new metric_timer(
             interval_ms,
             std::bind(&percentile<value_type, NthElementFinder>::find_nth_elements, this),
             std::bind(&percentile<value_type, NthElementFinder>::on_close, this)));
@@ -1159,7 +1248,7 @@ private:
     std::vector<std::atomic<value_type>> _full_nth_elements;
     NthElementFinder _nth_element_finder;
 
-    std::unique_ptr<percentile_timer> _timer;
+    std::unique_ptr<metric_timer> _timer;
 
     DISALLOW_COPY_AND_ASSIGN(percentile);
 };
diff --git a/src/utils/test/metrics_test.cpp b/src/utils/test/metrics_test.cpp
index 9b12a61a6..20414a274 100644
--- a/src/utils/test/metrics_test.cpp
+++ b/src/utils/test/metrics_test.cpp
@@ -19,6 +19,7 @@
 
 #include <chrono>
 #include <thread>
+#include <tuple>
 #include <vector>
 
 #include <gtest/gtest.h>
@@ -31,6 +32,8 @@
 
 namespace dsn {
 
+DSN_DECLARE_uint64(entity_retirement_delay_ms);
+
 class my_gauge : public metric
 {
 public:
@@ -2828,4 +2831,216 @@ TEST(metrics_test, http_get_metrics)
     }
 }
 
+using surviving_metrics_case = std::tuple<std::string, bool, bool, bool, bool>;
+
+class MetricsRetirementTest : public testing::TestWithParam<surviving_metrics_case>
+{
+public:
+    // For higher version of googletest, use `static void SetUpTestSuite()` instead.
+    static void SetUpTestCase()
+    {
+        // Restart the timer of registry with shorter interval to reduce the test time.
+        _reserved_entity_retirement_delay_ms = FLAGS_entity_retirement_delay_ms;
+        restart_metric_registry_timer(kEntityRetirementDelayMsForTest);
+    }
+
+    // For higher version of googletest, use `static void TearDownTestSuite()` instead.
+    static void TearDownTestCase()
+    {
+        // Recover the timer of registry with the original interval.
+        restart_metric_registry_timer(_reserved_entity_retirement_delay_ms);
+    }
+
+    static const uint64_t kEntityRetirementDelayMsForTest;
+
+private:
+    static void restart_metric_registry_timer(uint64_t interval_ms)
+    {
+        metric_registry::instance().stop_timer();
+        FLAGS_entity_retirement_delay_ms = interval_ms;
+        metric_registry::instance().start_timer();
+
+        std::cout << "restart the timer of metric registry at interval " << interval_ms << " ms."
+                  << std::endl;
+    }
+
+    static uint64_t _reserved_entity_retirement_delay_ms;
+};
+
+const uint64_t MetricsRetirementTest::kEntityRetirementDelayMsForTest = 100;
+uint64_t MetricsRetirementTest::_reserved_entity_retirement_delay_ms;
+
+// This class helps to test retirement of metrics and entities, by creating temporary
+// variables or reference them as members of this class to control their lifetime.
+class scoped_entity
+{
+public:
+    // Use the raw pointer to hold metric without any reference which may affect the test results.
+    using surviving_metric_map = std::unordered_map<const metric_prototype *, const metric *>;
+
+    scoped_entity(const std::string &entity_id,
+                  bool is_entity_surviving,
+                  bool is_gauge_surviving,
+                  bool is_counter_surviving,
+                  bool is_percentile_surviving);
+
+    // After a long enough time, check if temporary entity is retired with its own metrics while
+    // long-life one still survive.
+    void test_survival_after_retirement() const;
+
+private:
+    template <typename MetricPrototype, typename MetricPtr>
+    void instantiate_metric(const metric_entity_ptr &my_entity,
+                            bool is_surviving,
+                            const MetricPrototype &prototype,
+                            MetricPtr &m)
+    {
+        // Create a temporary variable for the metric.
+        auto temp_m = prototype.instantiate(my_entity);
+        _expected_all_metrics.emplace(&prototype, temp_m.get());
+
+        if (!is_surviving) {
+            return;
+        }
+
+        // Extend the lifetime of the metric since it's marked as "surviving".
+        m = temp_m;
+    }
+
+    surviving_metric_map get_actual_surviving_metrics(const metric_entity_ptr &my_entity) const;
+
+    // Check if the entity still survive with its own metrics no matter whether they are temporary
+    // or long-life.
+    void test_survival_immediately_after_initialization() const;
+
+    std::string _my_entity_id;
+    metric_entity *_expected_my_entity_raw_ptr;
+    metric_entity_ptr _my_entity;
+
+    gauge_ptr<int64_t> _my_gauge_int64;
+    counter_ptr<> _my_counter;
+    percentile_ptr<int64_t> _my_percentile_int64;
+
+    surviving_metric_map _expected_all_metrics;
+    surviving_metric_map _expected_surviving_metrics;
+};
+
+scoped_entity::scoped_entity(const std::string &entity_id,
+                             bool is_entity_surviving,
+                             bool is_gauge_surviving,
+                             bool is_counter_surviving,
+                             bool is_percentile_surviving)
+    : _my_entity_id(entity_id)
+{
+    // Create a temporary variable for the entity.
+    auto my_entity = METRIC_ENTITY_my_server.instantiate(entity_id);
+    _expected_my_entity_raw_ptr = my_entity.get();
+
+    // Create temporary or long-life variables for metrics, depending on what is_*_surviving is.
+    instantiate_metric(
+        my_entity, is_gauge_surviving, METRIC_test_server_gauge_int64, _my_gauge_int64);
+    instantiate_metric(my_entity, is_counter_surviving, METRIC_test_server_counter, _my_counter);
+    instantiate_metric(my_entity,
+                       is_percentile_surviving,
+                       METRIC_test_server_percentile_int64,
+                       _my_percentile_int64);
+
+    if (is_entity_surviving) {
+        // Extend the lifetime of the entity since it's marked as "surviving".
+        _my_entity = my_entity;
+        _expected_surviving_metrics = _expected_all_metrics;
+    }
+
+    test_survival_immediately_after_initialization();
+}
+
+scoped_entity::surviving_metric_map
+scoped_entity::get_actual_surviving_metrics(const metric_entity_ptr &my_entity) const
+{
+    surviving_metric_map actual_surviving_metrics;
+
+    utils::auto_read_lock l(my_entity->_lock);
+
+    // Use internal member directly instead of calling metrics(). We don't want to have
+    // any reference which may affect the test results.
+    for (const auto &m : my_entity->_metrics) {
+        actual_surviving_metrics.emplace(m.first, m.second.get());
+    }
+
+    return actual_surviving_metrics;
+}
+
+void scoped_entity::test_survival_immediately_after_initialization() const
+{
+    utils::auto_read_lock l(metric_registry::instance()._lock);
+
+    // Use internal member directly instead of calling entities(). We don't want to have
+    // any reference which may affect the test results.
+    const auto &entities = metric_registry::instance()._entities;
+    const auto &iter = entities.find(_my_entity_id);
+    ASSERT_NE(entities.end(), iter);
+    ASSERT_EQ(_expected_my_entity_raw_ptr, iter->second.get());
+
+    const auto &actual_surviving_metrics = get_actual_surviving_metrics(iter->second);
+    ASSERT_EQ(_expected_all_metrics, actual_surviving_metrics);
+}
+
+void scoped_entity::test_survival_after_retirement() const
+{
+    std::this_thread::sleep_for(
+        std::chrono::milliseconds(MetricsRetirementTest::kEntityRetirementDelayMsForTest * 2));
+
+    utils::auto_read_lock l(metric_registry::instance()._lock);
+
+    // Use internal member directly instead of calling entities(). We don't want to have
+    // any reference which may affect the test results.
+    const auto &entities = metric_registry::instance()._entities;
+    const auto &iter = entities.find(_my_entity_id);
+    if (_my_entity == nullptr) {
+        // The entity has been retired.
+        ASSERT_EQ(entities.end(), iter);
+        ASSERT_TRUE(_expected_surviving_metrics.empty());
+        return;
+    }
+
+    ASSERT_NE(entities.end(), iter);
+    ASSERT_EQ(_expected_my_entity_raw_ptr, iter->second.get());
+
+    const auto &actual_surviving_metrics = get_actual_surviving_metrics(iter->second);
+    ASSERT_EQ(_expected_surviving_metrics, actual_surviving_metrics);
+}
+
+TEST_P(MetricsRetirementTest, RetireOldMetrics)
+{
+    std::string entity_id;
+    bool is_entity_surviving;
+    bool is_gauge_surviving;
+    bool is_counter_surviving;
+    bool is_percentile_surviving;
+    std::tie(entity_id,
+             is_entity_surviving,
+             is_gauge_surviving,
+             is_counter_surviving,
+             is_percentile_surviving) = GetParam();
+
+    scoped_entity entity(entity_id,
+                         is_entity_surviving,
+                         is_gauge_surviving,
+                         is_counter_surviving,
+                         is_percentile_surviving);
+    entity.test_survival_after_retirement();
+}
+
+const std::vector<surviving_metrics_case> metrics_retirement_tests = {
+    {"server_117", true, true, true, true},
+    {"server_118", true, true, true, false},
+    {"server_119", true, true, false, false},
+    {"server_120", true, false, false, false},
+    {"server_121", false, false, false, false},
+};
+
+INSTANTIATE_TEST_CASE_P(MetricsTest,
+                        MetricsRetirementTest,
+                        testing::ValuesIn(metrics_retirement_tests));
+
 } // namespace dsn


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pegasus.apache.org
For additional commands, e-mail: commits-help@pegasus.apache.org