You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by jd...@apache.org on 2017/08/04 22:40:04 UTC

kudu git commit: Expose running maintenance op info

Repository: kudu
Updated Branches:
  refs/heads/master d0acb5551 -> 943b1ae26


Expose running maintenance op info

Previously, the maintenance manager stored the number of instances
a given operation has running, but information like the start time
of an instance was not made available until the instance completed.
However, the start times of running instances can be helpful
information to display on the web UI.

This commit adds a collection of running instances to the
MaintenanceManager so the information mentioned above can be
accessed while operations are running. Additionally, instances of
operations now store the id of the thread on which they run.

Change-Id: Ide228d7e70deae3ae89d108cbd270f3f0f2580ca
Reviewed-on: http://gerrit.cloudera.org:8080/7537
Tested-by: Kudu Jenkins
Reviewed-by: Andrew Wong <aw...@cloudera.com>
Reviewed-by: Jean-Daniel Cryans <jd...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/943b1ae2
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/943b1ae2
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/943b1ae2

Branch: refs/heads/master
Commit: 943b1ae26e62a0c823276013f2e3d3b8705a08c6
Parents: d0acb55
Author: Sam Okrent <sa...@cloudera.com>
Authored: Fri Jul 28 15:42:34 2017 -0700
Committer: Jean-Daniel Cryans <jd...@apache.org>
Committed: Fri Aug 4 22:39:31 2017 +0000

----------------------------------------------------------------------
 src/kudu/tserver/tserver-path-handlers.cc | 10 ++---
 src/kudu/util/maintenance_manager-test.cc | 28 ++++++++++++
 src/kudu/util/maintenance_manager.cc      | 60 +++++++++++++++-----------
 src/kudu/util/maintenance_manager.h       | 43 ++++++++++++++----
 src/kudu/util/maintenance_manager.proto   | 19 +++++---
 5 files changed, 114 insertions(+), 46 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/943b1ae2/src/kudu/tserver/tserver-path-handlers.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tserver-path-handlers.cc b/src/kudu/tserver/tserver-path-handlers.cc
index 476832d..10c425a 100644
--- a/src/kudu/tserver/tserver-path-handlers.cc
+++ b/src/kudu/tserver/tserver-path-handlers.cc
@@ -49,8 +49,8 @@ using kudu::consensus::ConsensusStatePB;
 using kudu::consensus::RaftPeerPB;
 using kudu::consensus::TransactionStatusPB;
 using kudu::MaintenanceManagerStatusPB;
-using kudu::MaintenanceManagerStatusPB_CompletedOpPB;
 using kudu::MaintenanceManagerStatusPB_MaintenanceOpPB;
+using kudu::MaintenanceManagerStatusPB_OpInstancePB;
 using kudu::tablet::Tablet;
 using kudu::tablet::TabletReplica;
 using kudu::tablet::TabletStatePB;
@@ -607,7 +607,7 @@ void TabletServerPathHandlers::HandleMaintenanceManagerPage(const Webserver::Web
   *output << "  <thead><tr><th>Name</th><th>Instances running</th></tr></thead>\n";
   *output << "<tbody>\n";
   for (int i = 0; i < ops_count; i++) {
-    MaintenanceManagerStatusPB_MaintenanceOpPB op_pb = pb.registered_operations(i);
+    const MaintenanceManagerStatusPB_MaintenanceOpPB& op_pb = pb.registered_operations(i);
     if (op_pb.running() > 0) {
       *output <<  Substitute("<tr><td>$0</td><td>$1</td></tr>\n",
                              EscapeForHtmlToString(op_pb.name()),
@@ -622,13 +622,13 @@ void TabletServerPathHandlers::HandleMaintenanceManagerPage(const Webserver::Web
       "<th>Time since op started</th></tr></thead>\n";
   *output << "<tbody>\n";
   for (int i = 0; i < pb.completed_operations_size(); i++) {
-    MaintenanceManagerStatusPB_CompletedOpPB op_pb = pb.completed_operations(i);
+    const MaintenanceManagerStatusPB_OpInstancePB& op_pb = pb.completed_operations(i);
     *output <<  Substitute("<tr><td>$0</td><td>$1</td><td>$2</td></tr>\n",
                            EscapeForHtmlToString(op_pb.name()),
                            HumanReadableElapsedTime::ToShortString(
                                op_pb.duration_millis() / 1000.0),
                            HumanReadableElapsedTime::ToShortString(
-                               op_pb.secs_since_start()));
+                               op_pb.millis_since_start() / 1000.0));
   }
   *output << "</tbody></table>\n";
 
@@ -638,7 +638,7 @@ void TabletServerPathHandlers::HandleMaintenanceManagerPage(const Webserver::Web
           << "       <th>Logs retained</th><th>Perf</th></tr></thead>\n";
   *output << "<tbody>\n";
   for (int i = 0; i < ops_count; i++) {
-    MaintenanceManagerStatusPB_MaintenanceOpPB op_pb = pb.registered_operations(i);
+    const MaintenanceManagerStatusPB_MaintenanceOpPB& op_pb = pb.registered_operations(i);
     if (op_pb.running() == 0) {
       *output << Substitute("<tr><td>$0</td><td>$1</td><td>$2</td><td>$3</td><td>$4</td></tr>\n",
                             EscapeForHtmlToString(op_pb.name()),

http://git-wip-us.apache.org/repos/asf/kudu/blob/943b1ae2/src/kudu/util/maintenance_manager-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/maintenance_manager-test.cc b/src/kudu/util/maintenance_manager-test.cc
index a16213e..07f7f22 100644
--- a/src/kudu/util/maintenance_manager-test.cc
+++ b/src/kudu/util/maintenance_manager-test.cc
@@ -293,6 +293,34 @@ TEST_F(MaintenanceManagerTest, TestLogRetentionPrioritization) {
   manager_->UnregisterOp(&op2);
 }
 
+// Test retrieving a list of an op's running instances
+TEST_F(MaintenanceManagerTest, TestRunningInstances) {
+  TestMaintenanceOp op("op", MaintenanceOp::HIGH_IO_USAGE);
+  op.set_perf_improvement(10);
+  op.set_remaining_runs(2);
+  op.set_sleep_time(MonoDelta::FromSeconds(1));
+  manager_->RegisterOp(&op);
+
+  // Check that running instances are added to the maintenance manager's collection,
+  // and fields are getting filled.
+  ASSERT_EVENTUALLY([&]() {
+      MaintenanceManagerStatusPB status_pb;
+      manager_->GetMaintenanceManagerStatusDump(&status_pb);
+      ASSERT_EQ(status_pb.running_operations_size(), 2);
+      const MaintenanceManagerStatusPB_OpInstancePB& instance1 = status_pb.running_operations(0);
+      const MaintenanceManagerStatusPB_OpInstancePB& instance2 = status_pb.running_operations(1);
+      ASSERT_EQ(instance1.name(), op.name());
+      ASSERT_NE(instance1.thread_id(), instance2.thread_id());
+    });
+
+  // Wait for instances to complete.
+  manager_->UnregisterOp(&op);
+
+  // Check that running instances are removed from collection after completion.
+  MaintenanceManagerStatusPB status_pb;
+  manager_->GetMaintenanceManagerStatusDump(&status_pb);
+  ASSERT_EQ(status_pb.running_operations_size(), 0);
+}
 // Test adding operations and make sure that the history of recently completed operations
 // is correct in that it wraps around and doesn't grow.
 TEST_F(MaintenanceManagerTest, TestCompletedOpsHistory) {

http://git-wip-us.apache.org/repos/asf/kudu/blob/943b1ae2/src/kudu/util/maintenance_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/util/maintenance_manager.cc b/src/kudu/util/maintenance_manager.cc
index 4bcd888..212c123 100644
--- a/src/kudu/util/maintenance_manager.cc
+++ b/src/kudu/util/maintenance_manager.cc
@@ -21,8 +21,10 @@
 #include <memory>
 #include <stdint.h>
 #include <string>
+#include <thread>
 #include <utility>
 
+#include "kudu/gutil/map-util.h"
 #include "kudu/gutil/stringprintf.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/util/debug/trace_event.h"
@@ -32,6 +34,7 @@
 #include "kudu/util/metrics.h"
 #include "kudu/util/process_memory.h"
 #include "kudu/util/random_util.h"
+#include "kudu/util/scoped_cleanup.h"
 #include "kudu/util/stopwatch.h"
 #include "kudu/util/thread.h"
 #include "kudu/util/trace.h"
@@ -122,10 +125,10 @@ MaintenanceManager::MaintenanceManager(const Options& options)
       FLAGS_maintenance_manager_num_threads : options.num_threads),
     cond_(&lock_),
     shutdown_(false),
-    running_ops_(0),
     polling_interval_ms_(options.polling_interval_ms <= 0 ?
           FLAGS_maintenance_manager_polling_interval_ms :
           options.polling_interval_ms),
+    running_ops_(0),
     completed_ops_count_(0),
     rand_(GetRandomSeed32()),
     memory_pressure_func_(&process_memory::UnderMemoryPressure) {
@@ -428,8 +431,33 @@ MaintenanceOp* MaintenanceManager::FindBestOp() {
 }
 
 void MaintenanceManager::LaunchOp(MaintenanceOp* op) {
-  MonoTime start_time = MonoTime::Now();
+  std::thread::id thread_id = std::this_thread::get_id();
+  OpInstance op_instance;
+  op_instance.thread_id = thread_id;
+  op_instance.name = op->name();
+  op_instance.start_mono_time = MonoTime::Now();
   op->RunningGauge()->Increment();
+  {
+    std::lock_guard<Mutex> lock(lock_);
+    InsertOrDie(&running_instances_, thread_id, &op_instance);
+  }
+
+  auto cleanup = MakeScopedCleanup([&]{
+    op->RunningGauge()->Decrement();
+
+    std::lock_guard<Mutex> l(lock_);
+    running_instances_.erase(thread_id);
+    op_instance.duration = MonoTime::Now() - op_instance.start_mono_time;
+    completed_ops_[completed_ops_count_ % completed_ops_.size()] = op_instance;
+    completed_ops_count_++;
+
+    op->DurationHistogram()->Increment(op_instance.duration.ToMilliseconds());
+
+    running_ops_--;
+    op->running_--;
+    op->cond_->Signal();
+    cond_.Signal(); // wake up scheduler
+  });
 
   scoped_refptr<Trace> trace(new Trace);
   LOG_TIMING(INFO, Substitute("running $0", op->name())) {
@@ -439,23 +467,6 @@ void MaintenanceManager::LaunchOp(MaintenanceOp* op) {
     op->Perform();
   }
   LOG_WITH_PREFIX(INFO) << op->name() << " metrics: " << trace->MetricsAsJSON();
-
-  op->RunningGauge()->Decrement();
-  MonoDelta delta = MonoTime::Now() - start_time;
-
-  std::lock_guard<Mutex> l(lock_);
-  CompletedOp& completed_op = completed_ops_[completed_ops_count_ % completed_ops_.size()];
-  completed_op.name = op->name();
-  completed_op.duration = delta;
-  completed_op.start_mono_time = start_time;
-  completed_ops_count_++;
-
-  op->DurationHistogram()->Increment(delta.ToMilliseconds());
-
-  running_ops_--;
-  op->running_--;
-  op->cond_->Signal();
-  cond_.Signal(); // wake up scheduler
 }
 
 void MaintenanceManager::GetMaintenanceManagerStatusDump(MaintenanceManagerStatusPB* out_pb) {
@@ -485,18 +496,17 @@ void MaintenanceManager::GetMaintenanceManagerStatusDump(MaintenanceManagerStatu
     }
   }
 
+  for (const auto& running_instance : running_instances_) {
+    *out_pb->add_running_operations() = running_instance.second->DumpToPB();
+  }
+
   for (int n = 1; n <= completed_ops_.size(); n++) {
     int i = completed_ops_count_ - n;
     if (i < 0) break;
     const auto& completed_op = completed_ops_[i % completed_ops_.size()];
 
     if (!completed_op.name.empty()) {
-      MaintenanceManagerStatusPB_CompletedOpPB* completed_pb = out_pb->add_completed_operations();
-      completed_pb->set_name(completed_op.name);
-      completed_pb->set_duration_millis(completed_op.duration.ToMilliseconds());
-
-      MonoDelta delta(MonoTime::Now().GetDeltaSince(completed_op.start_mono_time));
-      completed_pb->set_secs_since_start(delta.ToSeconds());
+      *out_pb->add_completed_operations() = completed_op.DumpToPB();
     }
   }
 }

http://git-wip-us.apache.org/repos/asf/kudu/blob/943b1ae2/src/kudu/util/maintenance_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/util/maintenance_manager.h b/src/kudu/util/maintenance_manager.h
index 2509d46..ef2fdf1 100644
--- a/src/kudu/util/maintenance_manager.h
+++ b/src/kudu/util/maintenance_manager.h
@@ -24,6 +24,7 @@
 #include <memory>
 #include <set>
 #include <string>
+#include <thread>
 #include <vector>
 
 #include "kudu/gutil/macros.h"
@@ -145,6 +146,31 @@ class MaintenanceOpStats {
   MonoTime last_modified_;
 };
 
+// Represents an instance of a maintenance operation.
+struct OpInstance {
+  // Id of thread the instance ran on.
+  std::thread::id thread_id;
+  // Name of operation.
+  std::string name;
+  // Time the operation took to run. Value is unitialized if instance is still running.
+  MonoDelta duration;
+  MonoTime start_mono_time;
+
+  MaintenanceManagerStatusPB_OpInstancePB DumpToPB() const {
+    MaintenanceManagerStatusPB_OpInstancePB pb;
+    std::stringstream ss;
+    ss << thread_id;
+    pb.set_thread_id(ss.str());
+    pb.set_name(name);
+    if (duration.Initialized()) {
+      pb.set_duration_millis(duration.ToMilliseconds());
+    }
+    MonoDelta delta(MonoTime::Now() - start_mono_time);
+    pb.set_millis_since_start(delta.ToMilliseconds());
+    return pb;
+  }
+};
+
 // MaintenanceOp objects represent background operations that the
 // MaintenanceManager can schedule.  Once a MaintenanceOp is registered, the
 // manager will periodically poll it for statistics.  The registrant is
@@ -242,13 +268,6 @@ struct MaintenanceOpComparator {
   }
 };
 
-// Holds the information regarding a recently completed operation.
-struct CompletedOp {
-  std::string name;
-  MonoDelta duration;
-  MonoTime start_mono_time;
-};
-
 // The MaintenanceManager manages the scheduling of background operations such
 // as flushes or compactions.  It runs these operations in the background, in a
 // thread pool.  It uses information provided in MaintenanceOpStats objects to
@@ -309,11 +328,17 @@ class MaintenanceManager : public std::enable_shared_from_this<MaintenanceManage
   gscoped_ptr<ThreadPool> thread_pool_;
   ConditionVariable cond_;
   bool shutdown_;
-  uint64_t running_ops_;
   int32_t polling_interval_ms_;
+  // Maps thread ids to instances of an op that they're running. Instances should be added
+  // right before MaintenanceOp::Perform() is called, and should be removed right after
+  // MaintenanceOp::Perform() completes. Any thread that adds an instance to this map
+  // owns that instance, and the instance should exist until the same thread removes it.
+  // Must acquire lock_ before accessing.
+  std::map<std::thread::id, OpInstance*> running_instances_;
+  uint64_t running_ops_;
   // Vector used as a circular buffer for recently completed ops. Elements need to be added at
   // the completed_ops_count_ % the vector's size and then the count needs to be incremented.
-  std::vector<CompletedOp> completed_ops_;
+  std::vector<OpInstance> completed_ops_;
   int64_t completed_ops_count_;
   std::string server_uuid_;
   Random rand_;

http://git-wip-us.apache.org/repos/asf/kudu/blob/943b1ae2/src/kudu/util/maintenance_manager.proto
----------------------------------------------------------------------
diff --git a/src/kudu/util/maintenance_manager.proto b/src/kudu/util/maintenance_manager.proto
index 75b0ab3..5e63875 100644
--- a/src/kudu/util/maintenance_manager.proto
+++ b/src/kudu/util/maintenance_manager.proto
@@ -31,11 +31,13 @@ message MaintenanceManagerStatusPB {
     required double perf_improvement = 6;
   }
 
-  message CompletedOpPB {
-    required string name = 1;
-    required int32 duration_millis = 2;
-    // Number of seconds since this operation started.
-    required int32 secs_since_start = 3;
+  message OpInstancePB {
+    required string thread_id = 1;
+    required string name = 2;
+    // How long the op took to run. Only present if the instance completed.
+    optional int32 duration_millis = 3;
+    // Number of milliseconds since this operation started.
+    required int32 millis_since_start = 4;
   }
 
   // The next operation that would run.
@@ -44,6 +46,9 @@ message MaintenanceManagerStatusPB {
   // List of all the operations.
   repeated MaintenanceOpPB registered_operations = 2;
 
-  // This list isn't in order of anything. Can contain the same operation mutiple times.
-  repeated CompletedOpPB completed_operations = 3;
+  // This list isn't in order of anything. Can contain the same operation multiple times.
+  repeated OpInstancePB running_operations = 3;
+
+  // This list isn't in order of anything. Can contain the same operation multiple times.
+  repeated OpInstancePB completed_operations = 4;
 }