You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by aw...@apache.org on 2021/11/03 06:59:19 UTC

[kudu] branch master updated (a31b96e -> 3e24e1b)

This is an automated email from the ASF dual-hosted git repository.

awong pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git.


    from a31b96e  KUDU-3332 [tools] Fix too large next_column_id after unsafe_rebuilding master
     new 59070bf  KUDU-1959 - Implement aggregate startup progress metrics
     new 3e24e1b  KUDU-1959 - Implement startup progress metrics related to containers and tablets

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/kudu/fs/fs_manager.cc               | 12 ++++++++
 src/kudu/fs/log_block_manager.cc        | 26 +++++++++++++++++
 src/kudu/server/server_base.cc          |  2 +-
 src/kudu/server/startup_path_handler.cc | 50 ++++++++++++++++++++++++++++++++-
 src/kudu/server/startup_path_handler.h  | 11 +++++++-
 src/kudu/tserver/ts_tablet_manager.cc   | 25 +++++++++++++++++
 src/kudu/tserver/ts_tablet_manager.h    |  4 +++
 7 files changed, 127 insertions(+), 3 deletions(-)

[kudu] 02/02: KUDU-1959 - Implement startup progress metrics related to containers and tablets

Posted by aw...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 3e24e1be4362ba9efb6d295ff96d3a18893f2733
Author: Abhishek Chennaka <ac...@cloudera.com>
AuthorDate: Mon Oct 18 22:03:23 2021 -0400

    KUDU-1959 - Implement startup progress metrics related to containers and tablets
    
    This patch implements the metrics related to the server startup.
    * In case of log block manager, we expose:
      - log_block_manager_total_containers_startup : total containers present,
      - log_block_manager_processed_containers_startup : count of containers
        opened/processed until the requested instant of time and
      - log_block_manager_containers_processing_time_startup : time elapsed
        for opening the containers. If the containers are not yet opened, we
        provide the time elapsed so far.
    * In case of tablet server, we expose:
      - tablets_num_total_startup : total tablets present,
      - tablets_num_opened_startup : count of tablets opened/processed until
        the requested instant of time and
      - tablets_opening_time_startup : time elapsed for opening the tablets.
        If the tablets are not yet opened, we provide the time elapsed so
        far.
    
    All times are in milliseconds, and the time metrics are exposed at debug
    level.
    
    Change-Id: I9d1aa85b0585214475a6bdb8c0e5d7343c5bc3c9
    Reviewed-on: http://gerrit.cloudera.org:8080/17947
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
    Tested-by: Andrew Wong <aw...@cloudera.com>
---
 src/kudu/fs/fs_manager.cc             | 12 ++++++++++++
 src/kudu/fs/log_block_manager.cc      | 26 ++++++++++++++++++++++++++
 src/kudu/tserver/ts_tablet_manager.cc | 25 +++++++++++++++++++++++++
 src/kudu/tserver/ts_tablet_manager.h  |  4 ++++
 4 files changed, 67 insertions(+)

diff --git a/src/kudu/fs/fs_manager.cc b/src/kudu/fs/fs_manager.cc
index 2ecca51..d58a195 100644
--- a/src/kudu/fs/fs_manager.cc
+++ b/src/kudu/fs/fs_manager.cc
@@ -50,6 +50,7 @@
 #include "kudu/util/env_util.h"
 #include "kudu/util/flag_tags.h"
 #include "kudu/util/metrics.h"
+#include "kudu/util/monotime.h"
 #include "kudu/util/net/net_util.h"
 #include "kudu/util/oid_generator.h"
 #include "kudu/util/path_util.h"
@@ -108,6 +109,13 @@ DEFINE_int64(fs_wal_dir_reserved_bytes, -1,
 DEFINE_validator(fs_wal_dir_reserved_bytes, [](const char* /*n*/, int64_t v) { return v >= -1; });
 TAG_FLAG(fs_wal_dir_reserved_bytes, runtime);
 
+METRIC_DEFINE_gauge_int64(server, log_block_manager_containers_processing_time_startup,
+                          "Time taken to open all log block containers during server startup",
+                          kudu::MetricUnit::kMilliseconds,
+                          "The total time taken by the server to open all the container "
+                          "files during the startup",
+                          kudu::MetricLevel::kDebug);
+
 using kudu::fs::BlockManagerOptions;
 using kudu::fs::CreateBlockOptions;
 using kudu::fs::DataDirManager;
@@ -463,6 +471,10 @@ Status FsManager::Open(FsReport* report, Timer* read_instance_metadata_files,
     }
     if (read_data_directories) {
       read_data_directories->Stop();
+      if (opts_.metric_entity && opts_.block_manager_type == "log") {
+        METRIC_log_block_manager_containers_processing_time_startup.Instantiate(opts_.metric_entity,
+            (read_data_directories->TimeElapsed()).ToMilliseconds());
+      }
     }
   }
   // Report wal and metadata directories.
diff --git a/src/kudu/fs/log_block_manager.cc b/src/kudu/fs/log_block_manager.cc
index 8fcae4a..9a17cdb 100644
--- a/src/kudu/fs/log_block_manager.cc
+++ b/src/kudu/fs/log_block_manager.cc
@@ -159,6 +159,20 @@ METRIC_DEFINE_counter(server, log_block_manager_dead_containers_deleted,
                       "Number of full (but dead) block containers that were deleted",
                       kudu::MetricLevel::kDebug);
 
+METRIC_DEFINE_gauge_uint64(server, log_block_manager_total_containers_startup,
+                           "Total number of Log Block Containers during startup",
+                           kudu::MetricUnit::kLogBlockContainers,
+                           "Number of log block containers which were present during the server "
+                           "startup",
+                           kudu::MetricLevel::kInfo);
+
+METRIC_DEFINE_gauge_uint64(server, log_block_manager_processed_containers_startup,
+                           "Number of Log Block Containers opened during startup",
+                           kudu::MetricUnit::kLogBlockContainers,
+                           "Number of log block containers which were opened/processed during "
+                           "the server startup",
+                           kudu::MetricLevel::kInfo);
+
 namespace kudu {
 
 namespace fs {
@@ -202,6 +216,9 @@ struct LogBlockManagerMetrics {
   scoped_refptr<AtomicGauge<uint64_t>> containers;
   scoped_refptr<AtomicGauge<uint64_t>> full_containers;
 
+  scoped_refptr<AtomicGauge<uint64_t>> total_containers_startup;
+  scoped_refptr<AtomicGauge<uint64_t>> processed_containers_startup;
+
   scoped_refptr<Counter> holes_punched;
   scoped_refptr<Counter> dead_containers_deleted;
 };
@@ -214,6 +231,8 @@ LogBlockManagerMetrics::LogBlockManagerMetrics(const scoped_refptr<MetricEntity>
     GINIT(blocks_under_management),
     GINIT(containers),
     GINIT(full_containers),
+    GINIT(total_containers_startup),
+    GINIT(processed_containers_startup),
     MINIT(holes_punched),
     MINIT(dead_containers_deleted) {
 }
@@ -2542,8 +2561,12 @@ void LogBlockManager::OpenDataDir(
     }
     InsertIfNotPresent(&containers_seen, container_name);
   }
+
   if (containers_total) {
     *containers_total += containers_seen.size();
+    if (metrics_) {
+      metrics()->total_containers_startup->IncrementBy(containers_seen.size());
+    }
   }
 
   for (const string& container_name : containers_seen) {
@@ -2554,6 +2577,9 @@ void LogBlockManager::OpenDataDir(
         this, dir, &results->back()->report, container_name, &container);
     if (containers_processed) {
       ++*containers_processed;
+      if (metrics_) {
+        metrics()->processed_containers_startup->Increment();
+      }
     }
     if (!s.ok()) {
       if (s.IsAborted()) {
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index d64041f..752b82b 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -235,6 +235,24 @@ METRIC_DEFINE_gauge_int32(server, tablets_num_shutdown,
                           "Number of tablets currently shut down",
                           kudu::MetricLevel::kInfo);
 
+METRIC_DEFINE_gauge_uint32(server, tablets_num_total_startup,
+                           "Number of Tablets Present During Startup",
+                           kudu::MetricUnit::kTablets,
+                           "Number of tablets present during server startup",
+                           kudu::MetricLevel::kInfo);
+
+METRIC_DEFINE_gauge_uint32(server, tablets_num_opened_startup,
+                           "Number of Tablets Opened During Startup",
+                           kudu::MetricUnit::kTablets,
+                           "Number of tablets opened during server startup",
+                           kudu::MetricLevel::kInfo);
+
+METRIC_DEFINE_gauge_int64(server, tablets_opening_time_startup,
+                          "Time Taken to Start the Tablets During Startup",
+                          kudu::MetricUnit::kMilliseconds,
+                          "Time taken to start the tablets during server startup",
+                          kudu::MetricLevel::kDebug);
+
 DECLARE_int32(heartbeat_interval_ms);
 
 using kudu::consensus::ConsensusMetadata;
@@ -343,6 +361,9 @@ TSTabletManager::TSTabletManager(TabletServer* server)
         return this->RefreshTabletStateCacheAndReturnCount(tablet::SHUTDOWN);
       })
       ->AutoDetach(&metric_detacher_);
+
+  tablets_num_opened_startup_ = METRIC_tablets_num_opened_startup.Instantiate(
+      server->metric_entity(), 0);
 }
 
 // Base class for tasks submitted against TSTabletManager threadpools whose
@@ -482,6 +503,7 @@ Status TSTabletManager::Init(Timer* start_tablets,
 
   // Now submit the "Open" task for each.
   *tablets_total = metas.size();
+  METRIC_tablets_num_total_startup.Instantiate(server_->metric_entity(), *tablets_total);
   *tablets_processed = 0;
   int registered_count = 0;
   for (const auto& meta : metas) {
@@ -1357,8 +1379,11 @@ void TSTabletManager::IncrementTabletsProcessed(int tablets_total,
                                             Timer* start_tablets) {
   if (tablets_processed) {
     ++*tablets_processed;
+    tablets_num_opened_startup_->Increment();
     if (*tablets_processed == tablets_total) {
       start_tablets->Stop();
+      METRIC_tablets_opening_time_startup.Instantiate(server_->metric_entity(),
+          (start_tablets->TimeElapsed()).ToMilliseconds());
     }
   }
 }
diff --git a/src/kudu/tserver/ts_tablet_manager.h b/src/kudu/tserver/ts_tablet_manager.h
index e7e3a6c..ed45c8d 100644
--- a/src/kudu/tserver/ts_tablet_manager.h
+++ b/src/kudu/tserver/ts_tablet_manager.h
@@ -480,6 +480,10 @@ class TSTabletManager : public tserver::TabletReplicaLookupIf {
   mutable rw_spinlock lock_update_;
   MonoTime next_update_time_;
 
+  // Keep track of number of tablets opened/attempted to be opened
+  // during server startup
+  scoped_refptr<AtomicGauge<uint32_t>> tablets_num_opened_startup_;
+
   // NOTE: it's important that this is the first member to be destructed. This
   // ensures we do not attempt to collect metrics while calling the destructor.
   FunctionGaugeDetacher metric_detacher_;

[kudu] 01/02: KUDU-1959 - Implement aggregate startup progress metrics

Posted by aw...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 59070bf5bd5924c6e4deb68434744cac3b062dcc
Author: Abhishek Chennaka <ac...@cloudera.com>
AuthorDate: Mon Oct 11 11:36:55 2021 -0700

    KUDU-1959 - Implement aggregate startup progress metrics
    
    We expose the below metrics as a part of this commit:
    * startup_progress_steps_remaining : count of server startup steps which
      are yet to be completed. This value is in the range [0,4].
    * startup_progress_time_elapsed : the time elapsed so far for the server to
      start up. If the startup is completed, this is the total time taken for the
      startup. This is in milliseconds.
    These metrics are primarily expected to be used by third-party monitoring tools
    to see how long the server has historically taken to start up, for any sort of
    trend analysis.
    The startup_progress_time_elapsed metric can also be used to check the
    previous startup time as an alternative to the startup page in the WebUI.
    
    Change-Id: I0a508c3baf0a0d77baf75f36f7bb305a6ad821e1
    Reviewed-on: http://gerrit.cloudera.org:8080/17903
    Tested-by: Kudu Jenkins
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
---
 src/kudu/server/server_base.cc          |  2 +-
 src/kudu/server/startup_path_handler.cc | 50 ++++++++++++++++++++++++++++++++-
 src/kudu/server/startup_path_handler.h  | 11 +++++++-
 3 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/src/kudu/server/server_base.cc b/src/kudu/server/server_base.cc
index bbbcefa..ec44729 100644
--- a/src/kudu/server/server_base.cc
+++ b/src/kudu/server/server_base.cc
@@ -495,7 +495,7 @@ ServerBase::ServerBase(string name, const ServerBaseOptions& options,
       file_cache_(new FileCache("file cache", options.env,
                                 GetFileCacheCapacity(options.env), metric_entity_)),
       rpc_server_(new RpcServer(options.rpc_opts)),
-      startup_path_handler_(new StartupPathHandler),
+      startup_path_handler_(new StartupPathHandler(metric_entity_)),
       result_tracker_(new rpc::ResultTracker(shared_ptr<MemTracker>(
           MemTracker::CreateTracker(-1, "result-tracker", mem_tracker_)))),
       is_first_run_(false),
diff --git a/src/kudu/server/startup_path_handler.cc b/src/kudu/server/startup_path_handler.cc
index 93f839e..6e1948f 100644
--- a/src/kudu/server/startup_path_handler.cc
+++ b/src/kudu/server/startup_path_handler.cc
@@ -24,10 +24,24 @@
 #include "kudu/gutil/strings/human_readable.h"
 #include "kudu/server/webserver.h"
 #include "kudu/util/easy_json.h"
+#include "kudu/util/metrics.h"
 #include "kudu/util/monotime.h"
 #include "kudu/util/timer.h"
 #include "kudu/util/web_callback_registry.h"
 
+METRIC_DEFINE_gauge_int32(server, startup_progress_steps_remaining,
+                          "Server Startup Steps Remaining",
+                          kudu::MetricUnit::kUnits,
+                          "Server startup progress steps remaining ",
+                          kudu::MetricLevel::kWarn);
+
+METRIC_DEFINE_gauge_int64(server, startup_progress_time_elapsed,
+                          "Server Startup Progress Time Elapsed",
+                          kudu::MetricUnit::kMilliseconds,
+                          "Time taken by the server to complete the startup or "
+                          "time elapsed so far for the server to startup",
+                          kudu::MetricLevel::kInfo);
+
 using std::ifstream;
 using std::ostringstream;
 using std::string;
@@ -43,13 +57,19 @@ void SetWebResponse(EasyJson* output, const string& step,
                   (startup_step.TimeElapsed()).ToSeconds()));
 }
 
-StartupPathHandler::StartupPathHandler():
+StartupPathHandler::StartupPathHandler(const scoped_refptr<MetricEntity>& entity):
   tablets_processed_(0),
   tablets_total_(0),
   containers_processed_(0),
   containers_total_(0),
   is_tablet_server_(false),
   is_using_lbm_(true) {
+  METRIC_startup_progress_steps_remaining.InstantiateFunctionGauge(entity,
+      [this]() {return StartupProgressStepsRemainingMetric();})
+      ->AutoDetachToLastValue(&metric_detacher_);
+  METRIC_startup_progress_time_elapsed.InstantiateFunctionGauge(entity,
+      [this]() {return StartupProgressTimeElapsedMetric().ToMilliseconds();})
+      ->AutoDetachToLastValue(&metric_detacher_);
 }
 
 void StartupPathHandler::Startup(const Webserver::WebRequest& /*req*/,
@@ -116,5 +136,33 @@ void StartupPathHandler::set_is_tablet_server(bool is_tablet_server) {
 void StartupPathHandler::set_is_using_lbm(bool is_using_lbm) {
   is_using_lbm_ = is_using_lbm;
 }
+
+// Number of startup steps (out of four) whose progress timer has not been stopped yet.
+int StartupPathHandler::StartupProgressStepsRemainingMetric() {
+  int counter = init_progress_.IsStopped() ? 0 : 1;
+  counter += (read_filesystem_progress_.IsStopped() ? 0 : 1);
+  // The third step is role-specific; count it exactly once so the value stays in [0,4].
+  if (is_tablet_server_) {
+    counter += start_tablets_progress_.IsStopped() ? 0 : 1;
+  } else {
+    counter += initialize_master_catalog_progress_.IsStopped() ? 0 : 1;
+  }
+  counter += (start_rpc_server_progress_.IsStopped() ? 0 : 1);
+  return counter;
+}
+
+// Sums the elapsed time of each startup step that applies to this server's role.
+MonoDelta StartupPathHandler::StartupProgressTimeElapsedMetric() {
+  MonoDelta total = init_progress_.TimeElapsed();
+  total += read_filesystem_progress_.TimeElapsed();
+  if (!is_tablet_server_) {
+    total += initialize_master_catalog_progress_.TimeElapsed();
+  } else {
+    total += start_tablets_progress_.TimeElapsed();
+  }
+  total += start_rpc_server_progress_.TimeElapsed();
+  return total;
+}
+
 } // namespace server
 } // namespace kudu
diff --git a/src/kudu/server/startup_path_handler.h b/src/kudu/server/startup_path_handler.h
index 1a32281..33ef1f4 100644
--- a/src/kudu/server/startup_path_handler.h
+++ b/src/kudu/server/startup_path_handler.h
@@ -18,7 +18,10 @@
 
 #include <atomic>
 
+#include "kudu/gutil/ref_counted.h"
 #include "kudu/server/webserver.h"
+#include "kudu/util/metrics.h"
+#include "kudu/util/monotime.h"
 #include "kudu/util/timer.h"
 
 namespace kudu {
@@ -28,7 +31,7 @@ namespace server {
 class StartupPathHandler {
 public:
 
-  StartupPathHandler();
+  explicit StartupPathHandler(const scoped_refptr<MetricEntity>& entity);
 
   // Populate the response output with the current information
   void Startup(const Webserver::WebRequest &req, Webserver::WebResponse *resp);
@@ -50,6 +53,10 @@ public:
   void set_is_tablet_server(bool is_tablet_server);
   void set_is_using_lbm(bool is_using_lbm);
 
+  // Callback functions for the aggregate steps-remaining and time-elapsed metrics.
+  int StartupProgressStepsRemainingMetric();
+  MonoDelta StartupProgressTimeElapsedMetric();
+
 private:
   // Hold the initialization step progress information like the status, start and end time.
   Timer init_progress_;
@@ -91,6 +98,8 @@ private:
   // We do not open containers if file block manager is being used and hence display different
   // webpage contents if file block manager is being used.
   bool is_using_lbm_;
+
+  FunctionGaugeDetacher metric_detacher_;
 };
 
 } // namespace server