You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@pegasus.apache.org by "acelyc111 (via GitHub)" <gi...@apache.org> on 2023/02/06 11:26:09 UTC

[GitHub] [incubator-pegasus] acelyc111 commented on a diff in pull request #1304: feat(new_metrics): retire stale metric entities that are not used by any other object

acelyc111 commented on code in PR #1304:
URL: https://github.com/apache/incubator-pegasus/pull/1304#discussion_r1097111121


##########
src/utils/metrics.cpp:
##########
@@ -383,6 +434,111 @@ metric_entity_ptr metric_registry::find_or_create_entity(const metric_entity_pro
     return entity;
 }
 
+metric_registry::collected_stale_entities_info metric_registry::collect_stale_entities() const
+{
+    collected_stale_entities_info collected_info;
+
+    auto now = dsn_now_ms();
+
+    utils::auto_read_lock l(_lock);
+
+    for (const auto &entity : _entities) {
+        if (!entity.second->is_stale()) {
+            if (entity.second->_retire_time_ms > 0) {
+                // This entity had been scheduled to be retired. However, it was reemployed
+                // after that. It has been in use since then, therefore its scheduled time
+                // for retirement should be reset to 0.
+                collected_info.stale_entities.insert(entity.first);

Review Comment:
   How about renaming `stale_entities` because this kind of entity is not "stale" ?



##########
src/utils/metrics.cpp:
##########
@@ -41,17 +48,15 @@ metric_entity::~metric_entity()
     close(close_option::kWait);
 }
 
-void metric_entity::close(close_option option)
+void metric_entity::close(close_option option) const
 {
     utils::auto_write_lock l(_lock);
 
-    // The reason why each metric is closed in the entity rather than in the destructor of each
-    // metric is that close() for the metric will return immediately without waiting for any close
-    // operation to be finished.
-    //
-    // Thus, to close all metrics owned by an entity, it's more efficient to firstly issue a close
-    // request for all metrics; then, just wait for all of the close operations to be finished.
-    // It's inefficient to wait for each metric to be closed one by one.
+    // To close all metrics owned by an entity, it's more efficient to firstly issue an asynchronous
+    // close request to each metric; then, just wait for all of the close operations to be finished.
+    // It's inefficient to wait for each metric to be closed one by one. Therefore, the metric is
+    // not
+    // closed in its destructor.

Review Comment:
   ```suggestion
       // not closed in its destructor.
   ```



##########
src/utils/metrics.cpp:
##########
@@ -41,17 +48,15 @@ metric_entity::~metric_entity()
     close(close_option::kWait);
 }
 
-void metric_entity::close(close_option option)
+void metric_entity::close(close_option option) const
 {
     utils::auto_write_lock l(_lock);
 
-    // The reason why each metric is closed in the entity rather than in the destructor of each
-    // metric is that close() for the metric will return immediately without waiting for any close
-    // operation to be finished.
-    //
-    // Thus, to close all metrics owned by an entity, it's more efficient to firstly issue a close
-    // request for all metrics; then, just wait for all of the close operations to be finished.
-    // It's inefficient to wait for each metric to be closed one by one.
+    // To close all metrics owned by an entity, it's more efficient to firstly issue an asynchronous
+    // close request to each metric; then, just wait for all of the close operations to be finished.
+    // It's inefficient to wait for each metric to be closed one by one. Therefore, the metric is
+    // not
+    // closed in its destructor.
     for (auto &m : _metrics) {

Review Comment:
   Now that this is a `const` function, can we loop with const reference ? But we will call  `p->close()`, is it really constant?



##########
src/utils/metrics.cpp:
##########
@@ -383,6 +434,111 @@ metric_entity_ptr metric_registry::find_or_create_entity(const metric_entity_pro
     return entity;
 }
 
+metric_registry::collected_stale_entities_info metric_registry::collect_stale_entities() const
+{
+    collected_stale_entities_info collected_info;
+
+    auto now = dsn_now_ms();
+
+    utils::auto_read_lock l(_lock);
+
+    for (const auto &entity : _entities) {
+        if (!entity.second->is_stale()) {
+            if (entity.second->_retire_time_ms > 0) {
+                // This entity had been scheduled to be retired. However, it was reemployed
+                // after that. It has been in use since then, therefore its scheduled time
+                // for retirement should be reset to 0.
+                collected_info.stale_entities.insert(entity.first);
+            }
+            continue;
+        }
+
+        if (entity.second->_retire_time_ms > now) {
+            // This entity has been scheduled to be retired, however it is still within
+            // the retention interval. Thus do not collect it.
+            ++collected_info.num_scheduled_entities;
+            continue;
+        }
+
+        collected_info.stale_entities.insert(entity.first);
+    }
+
+    collected_info.num_all_entities = _entities.size();
+    return collected_info;
+}
+
+metric_registry::retired_entities_stat
+metric_registry::retire_stale_entities(const stale_entity_list &stale_entities)
+{
+    if (stale_entities.empty()) {
+        // Do not lock for empty list.
+        return retired_entities_stat();
+    }
+
+    retired_entities_stat retired_stat;
+
+    auto now = dsn_now_ms();
+
+    utils::auto_write_lock l(_lock);
+
+    for (const auto &stale_entity : stale_entities) {
+        auto iter = _entities.find(stale_entity);
+        if (dsn_unlikely(iter == _entities.end())) {
+            // The entity has been removed from the registry for some unusual reason.
+            continue;
+        }
+
+        if (!iter->second->is_stale()) {
+            if (iter->second->_retire_time_ms > 0) {
+                // For those entities which are reemployed, their scheduled time for retirement
+                // should be reset to 0 though previously they could have been scheduled to be
+                // retired.
+                iter->second->_retire_time_ms = 0;
+                ++retired_stat.num_reemployed_entities;
+            }
+            continue;
+        }
+
+        if (dsn_unlikely(iter->second->_retire_time_ms > now)) {
+            // Since in collect_stale_entities() we've filtered the metrics which have been
+            // outside the retention interval, this is unlikely to happen. However, we still
+            // check here.
+            continue;
+        }
+
+        if (iter->second->_retire_time_ms == 0) {
+            // The entity should be marked with a scheduled time for retirement, since it has
+            // already been considered stale.
+            iter->second->_retire_time_ms = now + FLAGS_entity_retirement_delay_ms;
+            ++retired_stat.num_recently_scheduled_entities;
+            continue;
+        }
+
+        // Once the entity is outside the retention interval, retire it from the registry.
+        _entities.erase(iter);
+        ++retired_stat.num_retired_entities;
+    }
+
+    return retired_stat;
+}
+
+void metric_registry::process_stale_entities()
+{
+    LOG_INFO("begin to process stale metric entities");
+
+    const auto &collected_info = collect_stale_entities();
+    const auto &retired_stat = retire_stale_entities(collected_info.stale_entities);
+
+    LOG_INFO("stat for metric entities: total={}, collected={}, retired={}, scheduled={}, "
+             "recently_scheduled={}, reemployed={}",
+             collected_info.num_all_entities,
+             collected_info.stale_entities.size(),

Review Comment:
   So, how about renaming `stale_entities` to `collected_entities` ?



##########
src/utils/test/metrics_test.cpp:
##########
@@ -2828,4 +2831,216 @@ TEST(metrics_test, http_get_metrics)
     }
 }
 
+using surviving_metrics_case = std::tuple<std::string, bool, bool, bool, bool>;
+
+class MetricsRetirementTest : public testing::TestWithParam<surviving_metrics_case>

Review Comment:
   👍 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: dev-unsubscribe@pegasus.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@pegasus.apache.org
For additional commands, e-mail: dev-help@pegasus.apache.org