You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by aw...@apache.org on 2021/11/03 06:59:21 UTC

[kudu] 02/02: KUDU-1959 - Implement startup progress metrics related to containers and tablets

This is an automated email from the ASF dual-hosted git repository.

awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 3e24e1be4362ba9efb6d295ff96d3a18893f2733
Author: Abhishek Chennaka <ac...@cloudera.com>
AuthorDate: Mon Oct 18 22:03:23 2021 -0400

    KUDU-1959 - Implement startup progress metrics related to containers and tablets
    
    This patch implements the metrics related to the server startup.
    * In case of log block manager, we expose:
      - log_block_manager_total_containers_startup : total containers present,
      - log_block_manager_processed_containers_startup : count of containers
        opened/processed until the requested instant of time and
      - log_block_manager_containers_processing_time_startup : time elapsed
        for opening the containers. If the containers are not yet opened, we
        provide the time elapsed so far.
    * In case of tablet server, we expose:
      - tablets_num_total_startup : total tablets present,
      - tablets_num_opened_startup : count of tablets opened/processed until
        the requested instant of time and
      - tablets_opening_time_startup : time elapsed for opening the tablets.
        If the tablets are not yet opened, we provide the time elapsed so
        far.
    
    All times are in milliseconds, and the time metrics are exposed at the
    debug level.
    
    Change-Id: I9d1aa85b0585214475a6bdb8c0e5d7343c5bc3c9
    Reviewed-on: http://gerrit.cloudera.org:8080/17947
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
    Tested-by: Andrew Wong <aw...@cloudera.com>
---
 src/kudu/fs/fs_manager.cc             | 12 ++++++++++++
 src/kudu/fs/log_block_manager.cc      | 26 ++++++++++++++++++++++++++
 src/kudu/tserver/ts_tablet_manager.cc | 25 +++++++++++++++++++++++++
 src/kudu/tserver/ts_tablet_manager.h  |  4 ++++
 4 files changed, 67 insertions(+)

diff --git a/src/kudu/fs/fs_manager.cc b/src/kudu/fs/fs_manager.cc
index 2ecca51..d58a195 100644
--- a/src/kudu/fs/fs_manager.cc
+++ b/src/kudu/fs/fs_manager.cc
@@ -50,6 +50,7 @@
 #include "kudu/util/env_util.h"
 #include "kudu/util/flag_tags.h"
 #include "kudu/util/metrics.h"
+#include "kudu/util/monotime.h"
 #include "kudu/util/net/net_util.h"
 #include "kudu/util/oid_generator.h"
 #include "kudu/util/path_util.h"
@@ -108,6 +109,13 @@ DEFINE_int64(fs_wal_dir_reserved_bytes, -1,
 DEFINE_validator(fs_wal_dir_reserved_bytes, [](const char* /*n*/, int64_t v) { return v >= -1; });
 TAG_FLAG(fs_wal_dir_reserved_bytes, runtime);
 
+METRIC_DEFINE_gauge_int64(server, log_block_manager_containers_processing_time_startup,
+                          "Time taken to open all log block containers during server startup",
+                          kudu::MetricUnit::kMilliseconds,
+                          "The total time taken by the server to open all the container "
+                          "files during the startup",
+                          kudu::MetricLevel::kDebug);
+
 using kudu::fs::BlockManagerOptions;
 using kudu::fs::CreateBlockOptions;
 using kudu::fs::DataDirManager;
@@ -463,6 +471,10 @@ Status FsManager::Open(FsReport* report, Timer* read_instance_metadata_files,
     }
     if (read_data_directories) {
       read_data_directories->Stop();
+      if (opts_.metric_entity && opts_.block_manager_type == "log") {
+        METRIC_log_block_manager_containers_processing_time_startup.Instantiate(opts_.metric_entity,
+            (read_data_directories->TimeElapsed()).ToMilliseconds());
+      }
     }
   }
   // Report wal and metadata directories.
diff --git a/src/kudu/fs/log_block_manager.cc b/src/kudu/fs/log_block_manager.cc
index 8fcae4a..9a17cdb 100644
--- a/src/kudu/fs/log_block_manager.cc
+++ b/src/kudu/fs/log_block_manager.cc
@@ -159,6 +159,20 @@ METRIC_DEFINE_counter(server, log_block_manager_dead_containers_deleted,
                       "Number of full (but dead) block containers that were deleted",
                       kudu::MetricLevel::kDebug);
 
+METRIC_DEFINE_gauge_uint64(server, log_block_manager_total_containers_startup,
+                           "Total number of Log Block Containers during startup",
+                           kudu::MetricUnit::kLogBlockContainers,
+                           "Number of log block containers which were present during the server "
+                           "startup",
+                           kudu::MetricLevel::kInfo);
+
+METRIC_DEFINE_gauge_uint64(server, log_block_manager_processed_containers_startup,
+                           "Number of Log Block Containers opened during startup",
+                           kudu::MetricUnit::kLogBlockContainers,
+                           "Number of log block containers which were opened/processed during "
+                           "the server startup",
+                           kudu::MetricLevel::kInfo);
+
 namespace kudu {
 
 namespace fs {
@@ -202,6 +216,9 @@ struct LogBlockManagerMetrics {
   scoped_refptr<AtomicGauge<uint64_t>> containers;
   scoped_refptr<AtomicGauge<uint64_t>> full_containers;
 
+  scoped_refptr<AtomicGauge<uint64_t>> total_containers_startup;
+  scoped_refptr<AtomicGauge<uint64_t>> processed_containers_startup;
+
   scoped_refptr<Counter> holes_punched;
   scoped_refptr<Counter> dead_containers_deleted;
 };
@@ -214,6 +231,8 @@ LogBlockManagerMetrics::LogBlockManagerMetrics(const scoped_refptr<MetricEntity>
     GINIT(blocks_under_management),
     GINIT(containers),
     GINIT(full_containers),
+    GINIT(total_containers_startup),
+    GINIT(processed_containers_startup),
     MINIT(holes_punched),
     MINIT(dead_containers_deleted) {
 }
@@ -2542,8 +2561,12 @@ void LogBlockManager::OpenDataDir(
     }
     InsertIfNotPresent(&containers_seen, container_name);
   }
+
   if (containers_total) {
     *containers_total += containers_seen.size();
+    if (metrics_) {
+      metrics()->total_containers_startup->IncrementBy(containers_seen.size());
+    }
   }
 
   for (const string& container_name : containers_seen) {
@@ -2554,6 +2577,9 @@ void LogBlockManager::OpenDataDir(
         this, dir, &results->back()->report, container_name, &container);
     if (containers_processed) {
       ++*containers_processed;
+      if (metrics_) {
+        metrics()->processed_containers_startup->Increment();
+      }
     }
     if (!s.ok()) {
       if (s.IsAborted()) {
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index d64041f..752b82b 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -235,6 +235,24 @@ METRIC_DEFINE_gauge_int32(server, tablets_num_shutdown,
                           "Number of tablets currently shut down",
                           kudu::MetricLevel::kInfo);
 
+METRIC_DEFINE_gauge_uint32(server, tablets_num_total_startup,
+                           "Number of Tablets Present During Startup",
+                           kudu::MetricUnit::kTablets,
+                           "Number of tablets present during server startup",
+                           kudu::MetricLevel::kInfo);
+
+METRIC_DEFINE_gauge_uint32(server, tablets_num_opened_startup,
+                           "Number of Tablets Opened During Startup",
+                           kudu::MetricUnit::kTablets,
+                           "Number of tablets opened during server startup",
+                           kudu::MetricLevel::kInfo);
+
+METRIC_DEFINE_gauge_int64(server, tablets_opening_time_startup,
+                          "Time Taken to Start the Tablets During Startup",
+                          kudu::MetricUnit::kMilliseconds,
+                          "Time taken to start the tablets during server startup",
+                          kudu::MetricLevel::kDebug);
+
 DECLARE_int32(heartbeat_interval_ms);
 
 using kudu::consensus::ConsensusMetadata;
@@ -343,6 +361,9 @@ TSTabletManager::TSTabletManager(TabletServer* server)
         return this->RefreshTabletStateCacheAndReturnCount(tablet::SHUTDOWN);
       })
       ->AutoDetach(&metric_detacher_);
+
+  tablets_num_opened_startup_ = METRIC_tablets_num_opened_startup.Instantiate(
+      server->metric_entity(), 0);
 }
 
 // Base class for tasks submitted against TSTabletManager threadpools whose
@@ -482,6 +503,7 @@ Status TSTabletManager::Init(Timer* start_tablets,
 
   // Now submit the "Open" task for each.
   *tablets_total = metas.size();
+  METRIC_tablets_num_total_startup.Instantiate(server_->metric_entity(), *tablets_total);
   *tablets_processed = 0;
   int registered_count = 0;
   for (const auto& meta : metas) {
@@ -1357,8 +1379,11 @@ void TSTabletManager::IncrementTabletsProcessed(int tablets_total,
                                             Timer* start_tablets) {
   if (tablets_processed) {
     ++*tablets_processed;
+    tablets_num_opened_startup_->Increment();
     if (*tablets_processed == tablets_total) {
       start_tablets->Stop();
+      METRIC_tablets_opening_time_startup.Instantiate(server_->metric_entity(),
+          (start_tablets->TimeElapsed()).ToMilliseconds());
     }
   }
 }
diff --git a/src/kudu/tserver/ts_tablet_manager.h b/src/kudu/tserver/ts_tablet_manager.h
index e7e3a6c..ed45c8d 100644
--- a/src/kudu/tserver/ts_tablet_manager.h
+++ b/src/kudu/tserver/ts_tablet_manager.h
@@ -480,6 +480,10 @@ class TSTabletManager : public tserver::TabletReplicaLookupIf {
   mutable rw_spinlock lock_update_;
   MonoTime next_update_time_;
 
+  // Keeps track of the number of tablets opened (or attempted to be opened)
+  // during server startup.
+  scoped_refptr<AtomicGauge<uint32_t>> tablets_num_opened_startup_;
+
   // NOTE: it's important that this is the first member to be destructed. This
   // ensures we do not attempt to collect metrics while calling the destructor.
   FunctionGaugeDetacher metric_detacher_;