You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pegasus.apache.org by wa...@apache.org on 2023/05/25 10:55:25 UTC

[incubator-pegasus] 26/28: feat(new_metrics): migrate metrics for replica_stub (part 7) (#1475)

This is an automated email from the ASF dual-hosted git repository.

wangdan pushed a commit to branch migrate-metrics-dev
in repository https://gitbox.apache.org/repos/asf/incubator-pegasus.git

commit b3e8f0688ecb8d2a496864fed06a7486db5b90c7
Author: Dan Wang <wa...@apache.org>
AuthorDate: Wed May 17 16:44:21 2023 +0800

    feat(new_metrics): migrate metrics for replica_stub (part 7) (#1475)
    
    https://github.com/apache/incubator-pegasus/issues/1454
    
    This is the 7th part of migrating the metrics of replica_stub to the new
    framework; all metrics in this part are related to partition splitting.
    This is also the last part for replica_stub.
    
    During this migration, 6 metrics are changed from server-level to
    replica-level: the number of started splittings, the number and the size of
    files copied for splitting, the number of mutations copied for splitting,
    and the numbers of failed and successful splittings.
    
    The other 4 metrics remain server-level: the number of currently splitting
    replicas, the max duration among all splitting replicas, the max duration
    of async learns among all splitting replicas, and the max size of copied
    files among all splitting replicas.
---
 src/replica/replica_stub.cpp                | 93 ++++++++++-------------------
 src/replica/replica_stub.h                  | 18 ++----
 src/replica/split/replica_split_manager.cpp | 55 +++++++++++++----
 src/replica/split/replica_split_manager.h   |  8 +++
 src/replica/test/replica_test.cpp           |  1 -
 src/utils/metrics.h                         |  1 +
 6 files changed, 88 insertions(+), 88 deletions(-)

diff --git a/src/replica/replica_stub.cpp b/src/replica/replica_stub.cpp
index 89479f288..834663629 100644
--- a/src/replica/replica_stub.cpp
+++ b/src/replica/replica_stub.cpp
@@ -62,7 +62,6 @@
 #include "mutation_log.h"
 #include "nfs/nfs_node.h"
 #include "nfs_types.h"
-#include "perf_counter/perf_counter.h"
 #include "replica.h"
 #include "replica/duplication/replica_follower.h"
 #include "replica/log_file.h"
@@ -205,6 +204,26 @@ METRIC_DEFINE_gauge_int64(server,
                           dsn::metric_unit::kMilliSeconds,
                           "The max duration of bulk loads");
 
+METRIC_DEFINE_gauge_int64(server,
+                          splitting_replicas,
+                          dsn::metric_unit::kReplicas,
+                          "The number of current splitting replicas");
+
+METRIC_DEFINE_gauge_int64(server,
+                          splitting_replicas_max_duration_ms,
+                          dsn::metric_unit::kMilliSeconds,
+                          "The max duration among all splitting replicas");
+
+METRIC_DEFINE_gauge_int64(server,
+                          splitting_replicas_async_learn_max_duration_ms,
+                          dsn::metric_unit::kMilliSeconds,
+                          "The max duration among all splitting replicas for async learns");
+
+METRIC_DEFINE_gauge_int64(server,
+                          splitting_replicas_max_copy_file_bytes,
+                          dsn::metric_unit::kBytes,
+                          "The max size of copied files among all splitting replicas");
+
 namespace dsn {
 namespace replication {
 DSN_DEFINE_bool(replication,
@@ -325,7 +344,11 @@ replica_stub::replica_stub(replica_state_subscriber subscriber /*= nullptr*/,
       METRIC_VAR_INIT_server(write_busy_requests),
       METRIC_VAR_INIT_server(bulk_load_running_count),
       METRIC_VAR_INIT_server(bulk_load_ingestion_max_duration_ms),
-      METRIC_VAR_INIT_server(bulk_load_max_duration_ms)
+      METRIC_VAR_INIT_server(bulk_load_max_duration_ms),
+      METRIC_VAR_INIT_server(splitting_replicas),
+      METRIC_VAR_INIT_server(splitting_replicas_max_duration_ms),
+      METRIC_VAR_INIT_server(splitting_replicas_async_learn_max_duration_ms),
+      METRIC_VAR_INIT_server(splitting_replicas_max_copy_file_bytes)
 {
 #ifdef DSN_ENABLE_GPERF
     _is_releasing_memory = false;
@@ -336,67 +359,10 @@ replica_stub::replica_stub(replica_state_subscriber subscriber /*= nullptr*/,
     _state = NS_Disconnected;
     _log = nullptr;
     _primary_address_str[0] = '\0';
-    install_perf_counters();
 }
 
 replica_stub::~replica_stub(void) { close(); }
 
-void replica_stub::install_perf_counters()
-{
-    // <- Partition split Metrics ->
-
-    _counter_replicas_splitting_count.init_app_counter("eon.replica_stub",
-                                                       "replicas.splitting.count",
-                                                       COUNTER_TYPE_NUMBER,
-                                                       "current partition splitting count");
-
-    _counter_replicas_splitting_max_duration_time_ms.init_app_counter(
-        "eon.replica_stub",
-        "replicas.splitting.max.duration.time(ms)",
-        COUNTER_TYPE_NUMBER,
-        "current partition splitting max duration time(ms)");
-    _counter_replicas_splitting_max_async_learn_time_ms.init_app_counter(
-        "eon.replica_stub",
-        "replicas.splitting.max.async.learn.time(ms)",
-        COUNTER_TYPE_NUMBER,
-        "current partition splitting max async learn time(ms)");
-    _counter_replicas_splitting_max_copy_file_size.init_app_counter(
-        "eon.replica_stub",
-        "replicas.splitting.max.copy.file.size",
-        COUNTER_TYPE_NUMBER,
-        "current splitting max copy file size");
-    _counter_replicas_splitting_recent_start_count.init_app_counter(
-        "eon.replica_stub",
-        "replicas.splitting.recent.start.count",
-        COUNTER_TYPE_VOLATILE_NUMBER,
-        "current splitting start count in the recent period");
-    _counter_replicas_splitting_recent_copy_file_count.init_app_counter(
-        "eon.replica_stub",
-        "replicas.splitting.recent.copy.file.count",
-        COUNTER_TYPE_VOLATILE_NUMBER,
-        "splitting copy file count in the recent period");
-    _counter_replicas_splitting_recent_copy_file_size.init_app_counter(
-        "eon.replica_stub",
-        "replicas.splitting.recent.copy.file.size",
-        COUNTER_TYPE_VOLATILE_NUMBER,
-        "splitting copy file size in the recent period");
-    _counter_replicas_splitting_recent_copy_mutation_count.init_app_counter(
-        "eon.replica_stub",
-        "replicas.splitting.recent.copy.mutation.count",
-        COUNTER_TYPE_VOLATILE_NUMBER,
-        "splitting copy mutation count in the recent period");
-    _counter_replicas_splitting_recent_split_succ_count.init_app_counter(
-        "eon.replica_stub",
-        "replicas.splitting.recent.split.succ.count",
-        COUNTER_TYPE_VOLATILE_NUMBER,
-        "splitting succeed count in the recent period");
-    _counter_replicas_splitting_recent_split_fail_count.init_app_counter(
-        "eon.replica_stub",
-        "replicas.splitting.recent.split.fail.count",
-        COUNTER_TYPE_VOLATILE_NUMBER,
-        "splitting fail count in the recent period");
-}
-
 void replica_stub::initialize(bool clear /* = false*/)
 {
     replication_options opts;
@@ -1797,10 +1763,11 @@ void replica_stub::on_gc()
     METRIC_VAR_SET(bulk_load_running_count, bulk_load_running_count);
     METRIC_VAR_SET(bulk_load_ingestion_max_duration_ms, bulk_load_max_ingestion_time_ms);
     METRIC_VAR_SET(bulk_load_max_duration_ms, bulk_load_max_duration_time_ms);
-    _counter_replicas_splitting_count->set(splitting_count);
-    _counter_replicas_splitting_max_duration_time_ms->set(splitting_max_duration_time_ms);
-    _counter_replicas_splitting_max_async_learn_time_ms->set(splitting_max_async_learn_time_ms);
-    _counter_replicas_splitting_max_copy_file_size->set(splitting_max_copy_file_size);
+    METRIC_VAR_SET(splitting_replicas, splitting_count);
+    METRIC_VAR_SET(splitting_replicas_max_duration_ms, splitting_max_duration_time_ms);
+    METRIC_VAR_SET(splitting_replicas_async_learn_max_duration_ms,
+                   splitting_max_async_learn_time_ms);
+    METRIC_VAR_SET(splitting_replicas_max_copy_file_bytes, splitting_max_copy_file_size);
 
     LOG_INFO("finish to garbage collection, time_used_ns = {}", dsn_now_ns() - start);
 }
diff --git a/src/replica/replica_stub.h b/src/replica/replica_stub.h
index 41ceec774..2defe9b85 100644
--- a/src/replica/replica_stub.h
+++ b/src/replica/replica_stub.h
@@ -56,7 +56,6 @@
 #include "failure_detector/failure_detector_multimaster.h"
 #include "metadata_types.h"
 #include "partition_split_types.h"
-#include "perf_counter/perf_counter_wrapper.h"
 #include "replica.h"
 #include "replica/mutation_log.h"
 #include "replica_admin_types.h"
@@ -350,7 +349,6 @@ private:
     void trigger_checkpoint(replica_ptr r, bool is_emergency);
     void handle_log_failure(error_code err);
 
-    void install_perf_counters();
     dsn::error_code on_kill_replica(gpid id);
 
     void get_replica_info(/*out*/ replica_info &info, /*in*/ replica_ptr r);
@@ -497,7 +495,6 @@ private:
     std::atomic_bool _is_releasing_memory{false};
 #endif
 
-    // performance counters
     METRIC_VAR_DECLARE_gauge_int64(total_replicas);
     METRIC_VAR_DECLARE_gauge_int64(opening_replicas);
     METRIC_VAR_DECLARE_gauge_int64(closing_replicas);
@@ -527,17 +524,10 @@ private:
     METRIC_VAR_DECLARE_gauge_int64(bulk_load_ingestion_max_duration_ms);
     METRIC_VAR_DECLARE_gauge_int64(bulk_load_max_duration_ms);
 
-    // <- Partition split Metrics ->
-    perf_counter_wrapper _counter_replicas_splitting_count;
-    perf_counter_wrapper _counter_replicas_splitting_max_duration_time_ms;
-    perf_counter_wrapper _counter_replicas_splitting_max_async_learn_time_ms;
-    perf_counter_wrapper _counter_replicas_splitting_max_copy_file_size;
-    perf_counter_wrapper _counter_replicas_splitting_recent_start_count;
-    perf_counter_wrapper _counter_replicas_splitting_recent_copy_file_count;
-    perf_counter_wrapper _counter_replicas_splitting_recent_copy_file_size;
-    perf_counter_wrapper _counter_replicas_splitting_recent_copy_mutation_count;
-    perf_counter_wrapper _counter_replicas_splitting_recent_split_fail_count;
-    perf_counter_wrapper _counter_replicas_splitting_recent_split_succ_count;
+    METRIC_VAR_DECLARE_gauge_int64(splitting_replicas);
+    METRIC_VAR_DECLARE_gauge_int64(splitting_replicas_max_duration_ms);
+    METRIC_VAR_DECLARE_gauge_int64(splitting_replicas_async_learn_max_duration_ms);
+    METRIC_VAR_DECLARE_gauge_int64(splitting_replicas_max_copy_file_bytes);
 
     dsn::task_tracker _tracker;
 };
diff --git a/src/replica/split/replica_split_manager.cpp b/src/replica/split/replica_split_manager.cpp
index 0c63a5ebf..49cd9449f 100644
--- a/src/replica/split/replica_split_manager.cpp
+++ b/src/replica/split/replica_split_manager.cpp
@@ -29,8 +29,6 @@
 #include "dsn.layer2_types.h"
 #include "failure_detector/failure_detector_multimaster.h"
 #include "partition_split_types.h"
-#include "perf_counter/perf_counter.h"
-#include "perf_counter/perf_counter_wrapper.h"
 #include "replica/mutation_log.h"
 #include "replica/prepare_list.h"
 #include "replica/replica_context.h"
@@ -51,6 +49,36 @@
 #include "utils/string_view.h"
 #include "utils/thread_access_checker.h"
 
+METRIC_DEFINE_counter(replica,
+                      splitting_started_count,
+                      dsn::metric_unit::kPartitionSplittings,
+                      "The number of started splittings");
+
+METRIC_DEFINE_counter(replica,
+                      splitting_copy_file_count,
+                      dsn::metric_unit::kFiles,
+                      "The number of files copied for splitting");
+
+METRIC_DEFINE_counter(replica,
+                      splitting_copy_file_bytes,
+                      dsn::metric_unit::kBytes,
+                      "The size of files copied for splitting");
+
+METRIC_DEFINE_counter(replica,
+                      splitting_copy_mutation_count,
+                      dsn::metric_unit::kMutations,
+                      "The number of mutations copied for splitting");
+
+METRIC_DEFINE_counter(replica,
+                      splitting_failed_count,
+                      dsn::metric_unit::kPartitionSplittings,
+                      "The number of failed splittings");
+
+METRIC_DEFINE_counter(replica,
+                      splitting_successful_count,
+                      dsn::metric_unit::kPartitionSplittings,
+                      "The number of successful splittings");
+
 namespace dsn {
 namespace replication {
 
@@ -58,7 +86,15 @@ DSN_DECLARE_bool(empty_write_disabled);
 DSN_DECLARE_int32(max_mutation_count_in_prepare_list);
 
 replica_split_manager::replica_split_manager(replica *r)
-    : replica_base(r), _replica(r), _stub(r->get_replica_stub())
+    : replica_base(r),
+      _replica(r),
+      _stub(r->get_replica_stub()),
+      METRIC_VAR_INIT_replica(splitting_started_count),
+      METRIC_VAR_INIT_replica(splitting_copy_file_count),
+      METRIC_VAR_INIT_replica(splitting_copy_file_bytes),
+      METRIC_VAR_INIT_replica(splitting_copy_mutation_count),
+      METRIC_VAR_INIT_replica(splitting_failed_count),
+      METRIC_VAR_INIT_replica(splitting_successful_count)
 {
     _partition_version.store(_replica->_app_info.partition_count - 1);
 }
@@ -159,7 +195,7 @@ void replica_split_manager::child_init_replica(gpid parent_gpid,
                          get_gpid().thread_hash(),
                          std::chrono::seconds(3));
     _replica->_split_states.splitting_start_ts_ns = dsn_now_ns();
-    _stub->_counter_replicas_splitting_recent_start_count->increment();
+    METRIC_VAR_INCREMENT(splitting_started_count);
 
     LOG_INFO_PREFIX(
         "child initialize succeed, init_ballot={}, parent_gpid={}", init_ballot, parent_gpid);
@@ -469,8 +505,8 @@ replica_split_manager::child_apply_private_logs(std::vector<std::string> plog_fi
 
     _replica->_split_states.splitting_copy_file_count += plog_files.size();
     _replica->_split_states.splitting_copy_file_size += total_file_size;
-    _stub->_counter_replicas_splitting_recent_copy_file_count->add(plog_files.size());
-    _stub->_counter_replicas_splitting_recent_copy_file_size->add(total_file_size);
+    METRIC_VAR_INCREMENT_BY(splitting_copy_file_count, plog_files.size());
+    METRIC_VAR_INCREMENT_BY(splitting_copy_file_bytes, total_file_size);
 
     LOG_INFO_PREFIX("replay private_log files succeed, file count={}, app last_committed_decree={}",
                     plog_files.size(),
@@ -494,7 +530,7 @@ replica_split_manager::child_apply_private_logs(std::vector<std::string> plog_fi
         ++count;
     }
     _replica->_split_states.splitting_copy_mutation_count += count;
-    _stub->_counter_replicas_splitting_recent_copy_mutation_count->add(count);
+    METRIC_VAR_INCREMENT_BY(splitting_copy_mutation_count, count);
     plist.commit(last_committed_decree, COMMIT_TO_DECREE_HARD);
     LOG_INFO_PREFIX(
         "apply in-memory mutations succeed, mutation count={}, app last_committed_decree={}",
@@ -1096,11 +1132,10 @@ void replica_split_manager::child_partition_active(
         return;
     }
 
-    _stub->_counter_replicas_splitting_recent_split_succ_count->increment();
     _replica->_primary_states.last_prepare_decree_on_new_primary =
         _replica->_prepare_list->max_decree();
     _replica->update_configuration(config);
-    _stub->_counter_replicas_splitting_recent_split_succ_count->increment();
+    METRIC_VAR_INCREMENT(splitting_successful_count);
     LOG_INFO_PREFIX("child partition is active, status={}", enum_to_string(status()));
 }
 
@@ -1123,7 +1158,7 @@ void replica_split_manager::child_handle_split_error(
                          _replica->_split_states.parent_gpid,
                          _replica->_split_states.total_ms(),
                          _replica->_split_states.async_learn_ms());
-        _stub->_counter_replicas_splitting_recent_split_fail_count->increment();
+        METRIC_VAR_INCREMENT(splitting_failed_count);
         _replica->update_local_configuration_with_no_ballot_change(partition_status::PS_ERROR);
     }
 }
diff --git a/src/replica/split/replica_split_manager.h b/src/replica/split/replica_split_manager.h
index 9676e8494..7435e1705 100644
--- a/src/replica/split/replica_split_manager.h
+++ b/src/replica/split/replica_split_manager.h
@@ -32,6 +32,7 @@
 #include "replica/replica_base.h"
 #include "utils/error_code.h"
 #include "utils/fmt_logging.h"
+#include "utils/metrics.h"
 #include "utils/ports.h"
 
 namespace dsn {
@@ -251,6 +252,13 @@ private:
     // It will be updated each time when config sync from meta
     // TODO(heyuchen): clear it when primary parent clean up status
     split_status::type _meta_split_status{split_status::NOT_SPLIT};
+
+    METRIC_VAR_DECLARE_counter(splitting_started_count);
+    METRIC_VAR_DECLARE_counter(splitting_copy_file_count);
+    METRIC_VAR_DECLARE_counter(splitting_copy_file_bytes);
+    METRIC_VAR_DECLARE_counter(splitting_copy_mutation_count);
+    METRIC_VAR_DECLARE_counter(splitting_failed_count);
+    METRIC_VAR_DECLARE_counter(splitting_successful_count);
 };
 
 } // namespace replication
diff --git a/src/replica/test/replica_test.cpp b/src/replica/test/replica_test.cpp
index 9eb67eed5..0c8b97cff 100644
--- a/src/replica/test/replica_test.cpp
+++ b/src/replica/test/replica_test.cpp
@@ -80,7 +80,6 @@ public:
     void SetUp() override
     {
         FLAGS_enable_http_server = false;
-        stub->install_perf_counters();
         mock_app_info();
         _mock_replica = stub->generate_replica_ptr(_app_info, pid, partition_status::PS_PRIMARY, 1);
 
diff --git a/src/utils/metrics.h b/src/utils/metrics.h
index 566db835d..3b88ba5e9 100644
--- a/src/utils/metrics.h
+++ b/src/utils/metrics.h
@@ -662,6 +662,7 @@ enum class metric_unit : size_t
     kPercent,
     kReplicas,
     kPartitions,
+    kPartitionSplittings,
     kServers,
     kRequests,
     kResponses,


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pegasus.apache.org
For additional commands, e-mail: commits-help@pegasus.apache.org