You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pegasus.apache.org by wa...@apache.org on 2023/02/07 07:45:26 UTC
[incubator-pegasus] branch master updated: feat(new_metrics): retire stale metric entities that are not used by any other object (#1304)
This is an automated email from the ASF dual-hosted git repository.
wangdan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pegasus.git
The following commit(s) were added to refs/heads/master by this push:
new 624cae5ac feat(new_metrics): retire stale metric entities that are not used by any other object (#1304)
624cae5ac is described below
commit 624cae5ac922dee96c20629d1bd35b0cc3b62e2f
Author: Dan Wang <wa...@apache.org>
AuthorDate: Tue Feb 7 15:45:20 2023 +0800
feat(new_metrics): retire stale metric entities that are not used by any other object (#1304)
---
src/utils/metrics.cpp | 191 +++++++++++++++++++++++++++++++----
src/utils/metrics.h | 193 ++++++++++++++++++++++++++----------
src/utils/test/metrics_test.cpp | 215 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 529 insertions(+), 70 deletions(-)
diff --git a/src/utils/metrics.cpp b/src/utils/metrics.cpp
index 6e83324d6..007fb2198 100644
--- a/src/utils/metrics.cpp
+++ b/src/utils/metrics.cpp
@@ -17,7 +17,9 @@
#include "utils/metrics.h"
+#include "runtime/api_layer1.h"
#include "utils/api_utilities.h"
+#include "utils/flags.h"
#include "utils/rand.h"
#include "utils/shared_io_service.h"
#include "utils/string_conv.h"
@@ -25,10 +27,15 @@
namespace dsn {
+// How long (in milliseconds) a stale entity is retained in the registry before it is retired.
+// The default is 10 minutes. The same value is used as the period of the registry's timer,
+// see metric_registry::start_timer().
+DSN_DEFINE_uint64(metrics,
+                  entity_retirement_delay_ms,
+                  10 * 60 * 1000,
+                  "The retention interval (milliseconds) for an entity after it becomes stale.");
+
metric_entity::metric_entity(const metric_entity_prototype *prototype,
const std::string &id,
const attr_map &attrs)
- : _prototype(prototype), _id(id), _attrs(attrs)
+ : _prototype(prototype), _id(id), _attrs(attrs), _retire_time_ms(0)
{
}
@@ -45,13 +52,10 @@ void metric_entity::close(close_option option)
{
utils::auto_write_lock l(_lock);
- // The reason why each metric is closed in the entity rather than in the destructor of each
- // metric is that close() for the metric will return immediately without waiting for any close
- // operation to be finished.
- //
- // Thus, to close all metrics owned by an entity, it's more efficient to firstly issue a close
- // request for all metrics; then, just wait for all of the close operations to be finished.
- // It's inefficient to wait for each metric to be closed one by one.
+ // To close all metrics owned by an entity, it's more efficient to firstly issue an asynchronous
+ // close request to each metric; then, just wait for all of the close operations to be finished.
+ // It's inefficient to wait for each metric to be closed one by one. Therefore, the metric is
+ // not closed in its destructor.
for (auto &m : _metrics) {
if (m.second->prototype()->type() == metric_type::kPercentile) {
auto p = down_cast<closeable_metric *>(m.second.get());
@@ -177,6 +181,17 @@ void metric_entity::take_snapshot(metric_json_writer &writer, const metric_filte
writer.EndObject();
}
+// Tell whether this entity is referenced by nothing but the registry.
+//
+// The reference count is read exactly once, so that the sanity check and the returned
+// decision are always based on the same value even if the count changes in the meantime.
+bool metric_entity::is_stale() const
+{
+    const auto ref_count = get_count();
+
+    // Since this entity itself is still being accessed, its reference count should be 1
+    // at least.
+    CHECK_GE(ref_count, 1);
+
+    // This entity is considered stale once there is only one reference for it kept in the
+    // registry.
+    return ref_count == 1;
+}
+
void metric_filters::extract_entity_metrics(const metric_entity::metric_map &candidates,
metric_entity::metric_map &target_metrics) const
{
@@ -312,10 +327,12 @@ metric_registry::metric_registry() : _http_service(this)
{
// We should ensure that metric_registry is destructed before shared_io_service is destructed.
// Once shared_io_service is destructed before metric_registry is destructed,
- // boost::asio::io_service needed by metrics in metric_registry such as percentile_timer will
+ // boost::asio::io_service needed by metrics in metric_registry such as metric_timer will
// be released firstly, then will lead to heap-use-after-free error since percentiles in
// metric_registry are still running but the resources they needed have been released.
tools::shared_io_service::instance();
+
+ start_timer();
}
metric_registry::~metric_registry()
@@ -336,6 +353,39 @@ metric_registry::~metric_registry()
for (auto &entity : _entities) {
entity.second->close(metric_entity::close_option::kNoWait);
}
+
+ stop_timer();
+}
+
+void metric_registry::on_close() {} // No-op; handed to metric_timer as its on_close callback.
+
+// Launch the periodic timer that processes stale entities. Nothing happens if the timer
+// has already been started.
+void metric_registry::start_timer()
+{
+    if (_timer != nullptr) {
+        return;
+    }
+
+    // The period of the timer equals the retention interval, namely
+    // FLAGS_entity_retirement_delay_ms milliseconds: an entity that is found stale in one
+    // round gets scheduled to be retired, and in the next round it's just about time to
+    // retire it.
+    const auto on_exec = [this]() { process_stale_entities(); };
+    const auto on_closed = [this]() { on_close(); };
+    _timer.reset(new metric_timer(FLAGS_entity_retirement_delay_ms, on_exec, on_closed));
+}
+
+// Stop the periodic timer, if any, and block until it has been closed.
+void metric_registry::stop_timer()
+{
+    if (_timer == nullptr) {
+        // There is no running timer to stop.
+        return;
+    }
+
+    // Launch the close, then wait until the timer reports that it has completed.
+    _timer->close();
+    _timer->wait();
+
+    // Drop the timer to mark it as stopped, so that start_timer() could create a new one.
+    _timer.reset();
}
metric_registry::entity_map metric_registry::entities() const
@@ -383,6 +433,111 @@ metric_entity_ptr metric_registry::find_or_create_entity(const metric_entity_pro
return entity;
}
+// Phase "collect": scan all entities under the read lock and pick the ids of those that
+// retire_stale_entities() has to deal with. Nothing is modified here.
+metric_registry::collected_entities_info metric_registry::collect_stale_entities() const
+{
+    collected_entities_info info;
+
+    const auto current_ms = dsn_now_ms();
+
+    utils::auto_read_lock l(_lock);
+
+    for (const auto &kv : _entities) {
+        // Bind by reference: copying the pointer would add a reference to the entity.
+        const auto &entity = kv.second;
+
+        if (entity->is_stale()) {
+            if (entity->_retire_time_ms > current_ms) {
+                // Scheduled to be retired, however still within the retention interval:
+                // only count it, do not collect it.
+                ++info.num_scheduled_entities;
+            } else {
+                // Either not scheduled yet (the timestamp is 0), or the scheduled time
+                // has been reached.
+                info.collected_entities.insert(kv.first);
+            }
+            continue;
+        }
+
+        if (entity->_retire_time_ms > 0) {
+            // This entity had been scheduled to be retired, but it has been reemployed
+            // since then. Collect it so that its scheduled time can be reset to 0.
+            info.collected_entities.insert(kv.first);
+        }
+    }
+
+    info.num_all_entities = _entities.size();
+    return info;
+}
+
+// Phase "retire": under the write lock, re-examine each collected entity and then either
+// cancel its scheduled retirement, schedule it to be retired, or remove it from the registry.
+// The state of each entity is checked again since it may have changed after it was collected.
+metric_registry::retired_entities_stat
+metric_registry::retire_stale_entities(const collected_entity_list &collected_entities)
+{
+    retired_entities_stat stat;
+
+    if (collected_entities.empty()) {
+        // Do not lock for empty list.
+        return stat;
+    }
+
+    const auto current_ms = dsn_now_ms();
+
+    utils::auto_write_lock l(_lock);
+
+    for (const auto &id : collected_entities) {
+        const auto pos = _entities.find(id);
+        if (dsn_unlikely(pos == _entities.end())) {
+            // The entity has been removed from the registry for some unusual reason.
+            continue;
+        }
+
+        // Use a raw pointer: copying the smart pointer would add a reference and thus
+        // the entity would never look stale.
+        metric_entity *const entity = pos->second.get();
+
+        if (!entity->is_stale()) {
+            if (entity->_retire_time_ms > 0) {
+                // The entity was scheduled to be retired, but has been reemployed: cancel
+                // the schedule by resetting the timestamp to 0.
+                entity->_retire_time_ms = 0;
+                ++stat.num_reemployed_entities;
+            }
+            continue;
+        }
+
+        if (entity->_retire_time_ms == 0) {
+            // The entity has just been found stale: mark it with a scheduled time for
+            // retirement.
+            entity->_retire_time_ms = current_ms + FLAGS_entity_retirement_delay_ms;
+            ++stat.num_recently_scheduled_entities;
+            continue;
+        }
+
+        if (dsn_unlikely(entity->_retire_time_ms > current_ms)) {
+            // collect_stale_entities() has already filtered out the entities that are still
+            // within the retention interval, thus this is unlikely to happen. However, we
+            // still check here.
+            continue;
+        }
+
+        // The retention interval has elapsed: retire the entity from the registry.
+        _entities.erase(pos);
+        ++stat.num_retired_entities;
+    }
+
+    return stat;
+}
+
+// Run one round of retirement ("collect", then "retire") and log the statistics of this
+// round. This is the function executed periodically by the timer of the registry.
+void metric_registry::process_stale_entities()
+{
+    LOG_INFO("begin to process stale metric entities");
+
+    const auto collected = collect_stale_entities();
+    const auto retired = retire_stale_entities(collected.collected_entities);
+
+    LOG_INFO("stat for metric entities: total={}, collected={}, retired={}, scheduled={}, "
+             "recently_scheduled={}, reemployed={}",
+             collected.num_all_entities,
+             collected.collected_entities.size(),
+             retired.num_retired_entities,
+             collected.num_scheduled_entities,
+             retired.num_recently_scheduled_entities,
+             retired.num_reemployed_entities);
+}
+
metric_prototype::metric_prototype(const ctor_args &args) : _args(args) {}
metric_prototype::~metric_prototype() {}
@@ -391,7 +546,7 @@ metric::metric(const metric_prototype *prototype) : _prototype(prototype) {}
closeable_metric::closeable_metric(const metric_prototype *prototype) : metric(prototype) {}
-uint64_t percentile_timer::generate_initial_delay_ms(uint64_t interval_ms)
+uint64_t metric_timer::generate_initial_delay_ms(uint64_t interval_ms)
{
CHECK_GT(interval_ms, 0);
@@ -403,7 +558,7 @@ uint64_t percentile_timer::generate_initial_delay_ms(uint64_t interval_ms)
return (rand::next_u64() % interval_seconds + 1) * 1000 + rand::next_u64() % 1000;
}
-percentile_timer::percentile_timer(uint64_t interval_ms, on_exec_fn on_exec, on_close_fn on_close)
+metric_timer::metric_timer(uint64_t interval_ms, on_exec_fn on_exec, on_close_fn on_close)
: _initial_delay_ms(generate_initial_delay_ms(interval_ms)),
_interval_ms(interval_ms),
_on_exec(on_exec),
@@ -413,10 +568,10 @@ percentile_timer::percentile_timer(uint64_t interval_ms, on_exec_fn on_exec, on_
_timer(new boost::asio::deadline_timer(tools::shared_io_service::instance().ios))
{
_timer->expires_from_now(boost::posix_time::milliseconds(_initial_delay_ms));
- _timer->async_wait(std::bind(&percentile_timer::on_timer, this, std::placeholders::_1));
+ _timer->async_wait(std::bind(&metric_timer::on_timer, this, std::placeholders::_1));
}
-void percentile_timer::close()
+void metric_timer::close()
{
// If the timer has already expired when cancel() is called, then the handlers for asynchronous
// wait operations will:
@@ -433,15 +588,15 @@ void percentile_timer::close()
}
}
-void percentile_timer::wait() { _completed.wait(); }
+void metric_timer::wait() { _completed.wait(); }
-void percentile_timer::on_close()
+void metric_timer::on_close()
{
_on_close();
_completed.notify();
}
-void percentile_timer::on_timer(const boost::system::error_code &ec)
+void metric_timer::on_timer(const boost::system::error_code &ec)
{
// This macro is defined for the case that handlers for asynchronous wait operations are no
// longer cancelled. It just checks the internal state atomically (since close() can also be
@@ -465,7 +620,7 @@ void percentile_timer::on_timer(const boost::system::error_code &ec)
// Cancel can only be launched by close().
auto expected_state = state::kClosing;
CHECK(_state.compare_exchange_strong(expected_state, state::kClosed),
- "wrong state for percentile_timer: {}, while expecting closing state",
+ "wrong state for metric_timer: {}, while expecting closing state",
static_cast<int>(expected_state));
on_close();
@@ -477,7 +632,7 @@ void percentile_timer::on_timer(const boost::system::error_code &ec)
TRY_PROCESS_TIMER_CLOSING();
_timer->expires_from_now(boost::posix_time::milliseconds(_interval_ms));
- _timer->async_wait(std::bind(&percentile_timer::on_timer, this, std::placeholders::_1));
+ _timer->async_wait(std::bind(&metric_timer::on_timer, this, std::placeholders::_1));
#undef TRY_PROCESS_TIMER_CLOSING
}
diff --git a/src/utils/metrics.h b/src/utils/metrics.h
index c7c1bda18..340d2fd9d 100644
--- a/src/utils/metrics.h
+++ b/src/utils/metrics.h
@@ -171,6 +171,7 @@ public:
private:
friend class metric_registry;
friend class ref_ptr<metric_entity>;
+ friend class scoped_entity;
metric_entity(const metric_entity_prototype *prototype,
const std::string &id,
@@ -197,6 +198,18 @@ private:
void encode_id(metric_json_writer &writer) const;
+ // Decide if an entity is stale. An entity becomes stale if it is no longer used by any other
+ // object.
+ //
+ // An entity could be bound to one or multiple objects. Once all of these objects are
+ // destroyed, this entity will become stale, which means all of the metrics held by this
+ // entity are also stale.
+ //
+ // For example, once a replica is removed, the replica entity (and all metrics it holds) will
+ // become stale; then, this entity is scheduled to be retired after a configurable retention
+ // interval; finally, this entity will be removed from the registry with all metrics it holds.
+ bool is_stale() const;
+
const metric_entity_prototype *const _prototype;
const std::string _id;
@@ -204,6 +217,12 @@ private:
attr_map _attrs;
metric_map _metrics;
+ // The timestamp when this entity should be retired:
+ // * default value is 0, which means this entity has not been scheduled to be retired;
+ // * otherwise, non-zero value means this entity has been scheduled to be retired, and will
+ // be retired at any time once current time has reached or exceeded this timestamp.
+ uint64_t _retire_time_ms;
+
DISALLOW_COPY_AND_ASSIGN(metric_entity);
};
@@ -354,10 +373,94 @@ private:
DISALLOW_COPY_AND_ASSIGN(metrics_http_service);
};
+// `metric_timer` is a timer class that runs metric-related work periodically, such as
+// computing percentiles or checking whether there are stale entities. It accepts `on_exec` and
+// `on_close` as the callbacks for execution and close.
+//
+// To prevent all such work (such as percentiles) from being launched at the same time and
+// leading to very high load, the first execution is delayed by a randomly generated interval.
+class metric_timer
+{
+public:
+ enum class state : int
+ {
+ kRunning,
+ kClosing, // closing has been launched by close(), however has not finished yet
+ kClosed, // closing has finished; on_close() is invoked right after this state is reached
+ };
+
+ using on_exec_fn = std::function<void()>;
+ using on_close_fn = std::function<void()>;
+
+ metric_timer(uint64_t interval_ms, on_exec_fn on_exec, on_close_fn on_close);
+ ~metric_timer() = default;
+
+ void close(); // launch closing of the timer; follow it with wait() to close synchronously
+ void wait(); // block until on_close() has notified the completion
+
+ // Get the initial delay that is randomly generated by `generate_initial_delay_ms()`.
+ uint64_t get_initial_delay_ms() const { return _initial_delay_ms; }
+
+private:
+ // Generate an initial delay randomly in case that all timers are fired at the
+ // same time. `interval_ms` must be greater than 0.
+ static uint64_t generate_initial_delay_ms(uint64_t interval_ms);
+
+ void on_close(); // run the `on_close` callback, then wake up whoever is blocked in wait()
+
+ void on_timer(const boost::system::error_code &ec); // handler of the asynchronous wait
+
+ const uint64_t _initial_delay_ms; // delay before the first expiration of the timer
+ const uint64_t _interval_ms; // delay between two subsequent expirations
+ const on_exec_fn _on_exec;
+ const on_close_fn _on_close;
+ std::atomic<state> _state;
+ utils::notify_event _completed; // notified by on_close(), waited by wait()
+ std::unique_ptr<boost::asio::deadline_timer> _timer;
+
+ DISALLOW_COPY_AND_ASSIGN(metric_timer);
+};
+
class metric_registry : public utils::singleton<metric_registry>
{
public:
using entity_map = std::unordered_map<std::string, metric_entity_ptr>;
+ using collected_entity_list = std::unordered_set<std::string>;
+
+ struct collected_entities_info
+ {
+ // The collected entities that will be processed by retire_stale_entities(). Following
+ // kinds of entities will be collected:
+ // * stale entities that are not within the retention interval, i.e. those that have not
+ // been scheduled to be retired yet, and those whose scheduled time has been reached;
+ // * entities that were previously scheduled to be retired however have been reemployed,
+ // which means their retirement should be cancelled by retire_stale_entities().
+ collected_entity_list collected_entities;
+
+ // The number of all entities in the registry.
+ size_t num_all_entities = 0;
+
+ // The number of stale entities still within the retention interval (thus not collected).
+ size_t num_scheduled_entities = 0;
+
+ collected_entities_info() = default;
+ };
+
+ struct retired_entities_stat
+ {
+ // The number of entities that have been removed from the registry in this round.
+ size_t num_retired_entities = 0;
+
+ // The number of entities that were found stale in this round and have just been
+ // scheduled to be retired.
+ size_t num_recently_scheduled_entities = 0;
+
+ // The number of the entities that had previously been scheduled to be retired and
+ // were recently reemployed, thus their scheduled retirement has been cancelled.
+ size_t num_reemployed_entities = 0;
+
+ retired_entities_stat() = default;
+ };
entity_map entities() const;
@@ -368,19 +471,51 @@ private:
friend class utils::singleton<metric_registry>;
friend void test_get_metrics_handler(const http_request &req, http_response &resp);
+ friend class scoped_entity;
+ friend class MetricsRetirementTest;
metric_registry();
~metric_registry();
+ void on_close();
+
+ void start_timer();
+ void stop_timer();
+
metric_entity_ptr find_or_create_entity(const metric_entity_prototype *prototype,
const std::string &id,
const metric_entity::attr_map &attrs);
+ // These functions are used to retire stale entities.
+ //
+ // Since retirement is infrequent, there tend to be no entities that should be retired.
+ // Therefore, the whole retirement process is divided into two phases: "collect" and
+ // "retire".
+ //
+ // At the first phase "collect", we just check if there are entities that:
+ // * have become stale, but have not been scheduled to be retired, or
+ // * should be retired immediately, or
+ // * previously were scheduled to be retired, but now have been reemployed.
+ //
+ // All operations in the first phase are read-only, needing just the read lock which is more
+ // lightweight. If some entities are found to meet the above conditions, albeit infrequently,
+ // they will be collected to be processed at the next phase.
+ //
+ // Collected entities, if any, will be processed at the second phase "retire":
+ // * stale entities will be scheduled to be retired;
+ // * the expired entities will be retired;
+ // * the retirement timestamp will be reset to 0 for reemployed entities.
+ collected_entities_info collect_stale_entities() const;
+ retired_entities_stat retire_stale_entities(const collected_entity_list &collected_entities);
+ void process_stale_entities();
+
mutable utils::rw_lock_nr _lock;
entity_map _entities;
metrics_http_service _http_service;
+ std::unique_ptr<metric_timer> _timer;
+
DISALLOW_COPY_AND_ASSIGN(metric_registry);
};
@@ -593,6 +728,8 @@ protected:
const metric_prototype *const _prototype;
private:
+ friend class metric_entity;
+
DISALLOW_COPY_AND_ASSIGN(metric);
};
@@ -892,56 +1029,6 @@ inline size_t kth_percentile_to_nth_index(size_t size, kth_percentile_type type)
return kth_percentile_to_nth_index(size, static_cast<size_t>(type));
}
-// `percentile_timer` is a timer class that encapsulates the details how each percentile is
-// computed periodically.
-//
-// To be instantiated, it requires `interval_ms` at which a percentile is computed and `exec`
-// which is used to compute percentile.
-//
-// In case that all percentiles are computed at the same time and lead to very high load,
-// first computation for percentile will be delayed at a random interval.
-class percentile_timer
-{
-public:
- enum class state : int
- {
- kRunning,
- kClosing,
- kClosed,
- };
-
- using on_exec_fn = std::function<void()>;
- using on_close_fn = std::function<void()>;
-
- percentile_timer(uint64_t interval_ms, on_exec_fn on_exec, on_close_fn on_close);
- ~percentile_timer() = default;
-
- void close();
- void wait();
-
- // Get the initial delay that is randomly generated by `generate_initial_delay_ms()`.
- uint64_t get_initial_delay_ms() const { return _initial_delay_ms; }
-
-private:
- // Generate an initial delay randomly in case that all percentiles are computed at the
- // same time.
- static uint64_t generate_initial_delay_ms(uint64_t interval_ms);
-
- void on_close();
-
- void on_timer(const boost::system::error_code &ec);
-
- const uint64_t _initial_delay_ms;
- const uint64_t _interval_ms;
- const on_exec_fn _on_exec;
- const on_close_fn _on_close;
- std::atomic<state> _state;
- utils::notify_event _completed;
- std::unique_ptr<boost::asio::deadline_timer> _timer;
-
- DISALLOW_COPY_AND_ASSIGN(percentile_timer);
-};
-
// The percentile is a metric type that samples observations. The size of samples has an upper
// bound. Once the maximum size is reached, the earliest observations will be overwritten.
//
@@ -1022,6 +1109,8 @@ protected:
// interval_ms is the interval between the computations for percentiles. Its unit is
// milliseconds. It's suggested that interval_ms should be near the period between pulls
// from or pushes to the monitoring system.
+ // TODO(wangdan): we can also support constructing percentiles from the parameters in
+ // the configuration file.
percentile(const metric_prototype *prototype,
uint64_t interval_ms = 10000,
const std::set<kth_percentile_type> &kth_percentiles = kAllKthPercentileTypes,
@@ -1066,7 +1155,7 @@ protected:
// See on_close() for details which is registered in timer and will be called
// back once close() is invoked.
add_ref();
- _timer.reset(new percentile_timer(
+ _timer.reset(new metric_timer(
interval_ms,
std::bind(&percentile<value_type, NthElementFinder>::find_nth_elements, this),
std::bind(&percentile<value_type, NthElementFinder>::on_close, this)));
@@ -1159,7 +1248,7 @@ private:
std::vector<std::atomic<value_type>> _full_nth_elements;
NthElementFinder _nth_element_finder;
- std::unique_ptr<percentile_timer> _timer;
+ std::unique_ptr<metric_timer> _timer;
DISALLOW_COPY_AND_ASSIGN(percentile);
};
diff --git a/src/utils/test/metrics_test.cpp b/src/utils/test/metrics_test.cpp
index 9b12a61a6..20414a274 100644
--- a/src/utils/test/metrics_test.cpp
+++ b/src/utils/test/metrics_test.cpp
@@ -19,6 +19,7 @@
#include <chrono>
#include <thread>
+#include <tuple>
#include <vector>
#include <gtest/gtest.h>
@@ -31,6 +32,8 @@
namespace dsn {
+DSN_DECLARE_uint64(entity_retirement_delay_ms);
+
class my_gauge : public metric
{
public:
@@ -2828,4 +2831,216 @@ TEST(metrics_test, http_get_metrics)
}
}
+using surviving_metrics_case = std::tuple<std::string, bool, bool, bool, bool>;
+
+// Value-parameterized fixture for the retirement tests. While the tests of this fixture are
+// running, the timer of the registry works with a much shorter retention interval; the
+// original interval is recovered after all of them have finished.
+class MetricsRetirementTest : public testing::TestWithParam<surviving_metrics_case>
+{
+public:
+    // The retention interval (milliseconds) applied while the tests are running.
+    static const uint64_t kEntityRetirementDelayMsForTest;
+
+    // For higher version of googletest, use `static void SetUpTestSuite()` instead.
+    static void SetUpTestCase()
+    {
+        // Remember the configured interval, then restart the timer of the registry with a
+        // shorter one to reduce the test time.
+        _reserved_entity_retirement_delay_ms = FLAGS_entity_retirement_delay_ms;
+        restart_metric_registry_timer(kEntityRetirementDelayMsForTest);
+    }
+
+    // For higher version of googletest, use `static void TearDownTestSuite()` instead.
+    static void TearDownTestCase()
+    {
+        // Recover the timer of the registry with the original interval.
+        restart_metric_registry_timer(_reserved_entity_retirement_delay_ms);
+    }
+
+private:
+    // Stop the timer of the registry, change the retention interval to `interval_ms`, and
+    // start the timer again so that the new interval takes effect.
+    static void restart_metric_registry_timer(uint64_t interval_ms)
+    {
+        auto &registry = metric_registry::instance();
+
+        registry.stop_timer();
+        FLAGS_entity_retirement_delay_ms = interval_ms;
+        registry.start_timer();
+
+        std::cout << "restart the timer of metric registry at interval " << interval_ms << " ms."
+                  << std::endl;
+    }
+
+    // The interval that was configured before the tests changed it.
+    static uint64_t _reserved_entity_retirement_delay_ms;
+};
+
+const uint64_t MetricsRetirementTest::kEntityRetirementDelayMsForTest = 100; // milliseconds
+uint64_t MetricsRetirementTest::_reserved_entity_retirement_delay_ms; // saved by SetUpTestCase()
+
+// This class helps to test retirement of metrics and entities, by creating them as temporary
+// variables or referencing them as members of this class to control their lifetime.
+class scoped_entity
+{
+public:
+ // Use the raw pointer to hold metric without any reference which may affect the test results.
+ using surviving_metric_map = std::unordered_map<const metric_prototype *, const metric *>;
+
+ scoped_entity(const std::string &entity_id,
+ bool is_entity_surviving,
+ bool is_gauge_surviving,
+ bool is_counter_surviving,
+ bool is_percentile_surviving);
+
+ // After a long enough time, check that a temporary entity has been retired with its own
+ // metrics, while a long-life one still survives.
+ void test_survival_after_retirement() const;
+
+private:
+ template <typename MetricPrototype, typename MetricPtr>
+ void instantiate_metric(const metric_entity_ptr &my_entity,
+ bool is_surviving,
+ const MetricPrototype &prototype,
+ MetricPtr &m)
+ {
+ // Create a temporary variable for the metric.
+ auto temp_m = prototype.instantiate(my_entity);
+ _expected_all_metrics.emplace(&prototype, temp_m.get());
+
+ if (!is_surviving) {
+ return;
+ }
+
+ // Extend the lifetime of the metric since it's marked as "surviving".
+ m = temp_m;
+ }
+
+ surviving_metric_map get_actual_surviving_metrics(const metric_entity_ptr &my_entity) const;
+
+ // Check that the entity still survives with all of its own metrics, no matter whether they
+ // are temporary or long-life.
+ void test_survival_immediately_after_initialization() const;
+
+ std::string _my_entity_id;
+ metric_entity *_expected_my_entity_raw_ptr;
+ metric_entity_ptr _my_entity;
+
+ gauge_ptr<int64_t> _my_gauge_int64;
+ counter_ptr<> _my_counter;
+ percentile_ptr<int64_t> _my_percentile_int64;
+
+ surviving_metric_map _expected_all_metrics;
+ surviving_metric_map _expected_surviving_metrics;
+};
+
+// Create the entity together with a gauge, a counter and a percentile. Each of them is kept
+// as a member only if its is_*_surviving flag is set; otherwise it is just referenced by a
+// temporary variable. Finally verify that all of them can be found in the registry.
+scoped_entity::scoped_entity(const std::string &entity_id,
+                             bool is_entity_surviving,
+                             bool is_gauge_surviving,
+                             bool is_counter_surviving,
+                             bool is_percentile_surviving)
+    : _my_entity_id(entity_id)
+{
+    // Create a temporary variable for the entity.
+    auto entity = METRIC_ENTITY_my_server.instantiate(entity_id);
+    _expected_my_entity_raw_ptr = entity.get();
+
+    // Create temporary or long-life variables for metrics, depending on what is_*_surviving is.
+    instantiate_metric(entity, is_gauge_surviving, METRIC_test_server_gauge_int64, _my_gauge_int64);
+    instantiate_metric(entity, is_counter_surviving, METRIC_test_server_counter, _my_counter);
+    instantiate_metric(
+        entity, is_percentile_surviving, METRIC_test_server_percentile_int64, _my_percentile_int64);
+
+    if (is_entity_surviving) {
+        // Extend the lifetime of the entity since it's marked as "surviving". All of its
+        // metrics are then expected to survive as well.
+        _my_entity = entity;
+        _expected_surviving_metrics = _expected_all_metrics;
+    }
+
+    test_survival_immediately_after_initialization();
+}
+
+// Take the metrics currently held by `my_entity` as raw pointers keyed by their prototypes.
+scoped_entity::surviving_metric_map
+scoped_entity::get_actual_surviving_metrics(const metric_entity_ptr &my_entity) const
+{
+    surviving_metric_map result;
+
+    utils::auto_read_lock l(my_entity->_lock);
+
+    // Read the internal member directly rather than calling metrics(): we don't want to
+    // have any reference which may affect the test results.
+    const auto &held_metrics = my_entity->_metrics;
+    for (auto it = held_metrics.begin(); it != held_metrics.end(); ++it) {
+        result.emplace(it->first, it->second.get());
+    }
+
+    return result;
+}
+
+// Right after construction, the entity must be found in the registry holding all of the
+// metrics that have been instantiated, no matter whether they are kept alive by this object.
+void scoped_entity::test_survival_immediately_after_initialization() const
+{
+    auto &registry = metric_registry::instance();
+    utils::auto_read_lock l(registry._lock);
+
+    // Read the internal member directly rather than calling entities(): we don't want to
+    // have any reference which may affect the test results.
+    const auto &entities = registry._entities;
+    const auto iter = entities.find(_my_entity_id);
+    ASSERT_NE(entities.end(), iter);
+    ASSERT_EQ(_expected_my_entity_raw_ptr, iter->second.get());
+
+    const auto actual_surviving_metrics = get_actual_surviving_metrics(iter->second);
+    ASSERT_EQ(_expected_all_metrics, actual_surviving_metrics);
+}
+
+// Sleep for twice the retention interval used by the tests, then check the registry: an
+// entity that is not kept by this object must have been retired, while a kept one must
+// still be registered with all of its expected metrics.
+void scoped_entity::test_survival_after_retirement() const
+{
+    const auto wait_ms = MetricsRetirementTest::kEntityRetirementDelayMsForTest * 2;
+    std::this_thread::sleep_for(std::chrono::milliseconds(wait_ms));
+
+    auto &registry = metric_registry::instance();
+    utils::auto_read_lock l(registry._lock);
+
+    // Read the internal member directly rather than calling entities(): we don't want to
+    // have any reference which may affect the test results.
+    const auto &entities = registry._entities;
+    const auto iter = entities.find(_my_entity_id);
+
+    const bool is_kept_by_this_object = !(_my_entity == nullptr);
+    if (!is_kept_by_this_object) {
+        // The entity has been retired.
+        ASSERT_EQ(entities.end(), iter);
+        ASSERT_TRUE(_expected_surviving_metrics.empty());
+        return;
+    }
+
+    // The entity is still in use, thus it must stay in the registry.
+    ASSERT_NE(entities.end(), iter);
+    ASSERT_EQ(_expected_my_entity_raw_ptr, iter->second.get());
+
+    const auto actual_surviving_metrics = get_actual_surviving_metrics(iter->second);
+    ASSERT_EQ(_expected_surviving_metrics, actual_surviving_metrics);
+}
+
+// For each case, create an entity with its metrics (checked immediately by the constructor),
+// then check which of them survive after the retention interval has elapsed.
+TEST_P(MetricsRetirementTest, RetireOldMetrics)
+{
+    // Fields of the case: entity id, then the "surviving" flags for the entity, the gauge,
+    // the counter and the percentile.
+    const auto &test_case = GetParam();
+
+    scoped_entity entity(std::get<0>(test_case),
+                         std::get<1>(test_case),
+                         std::get<2>(test_case),
+                         std::get<3>(test_case),
+                         std::get<4>(test_case));
+    entity.test_survival_after_retirement();
+}
+
+const std::vector<surviving_metrics_case> metrics_retirement_tests = { // {entity_id, is_entity_surviving, is_gauge_surviving, is_counter_surviving, is_percentile_surviving}
+ {"server_117", true, true, true, true},
+ {"server_118", true, true, true, false}, // entity kept: all of its metrics are still expected
+ {"server_119", true, true, false, false},
+ {"server_120", true, false, false, false},
+ {"server_121", false, false, false, false}, // entity not kept: expected to be retired
+};
+
+INSTANTIATE_TEST_CASE_P(MetricsTest,
+ MetricsRetirementTest,
+ testing::ValuesIn(metrics_retirement_tests));
+
} // namespace dsn
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pegasus.apache.org
For additional commands, e-mail: commits-help@pegasus.apache.org