You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by aw...@apache.org on 2019/10/08 22:57:01 UTC

[kudu] 03/03: KUDU-2069 p8: support listing tserver state through CLI

This is an automated email from the ASF dual-hosted git repository.

awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 1f4d473c74e7d57eeae3b384a46f4eb088e16223
Author: Andrew Wong <aw...@apache.org>
AuthorDate: Tue Oct 8 13:28:11 2019 -0700

    KUDU-2069 p8: support listing tserver state through CLI
    
    This patch adds support to the tserver list tool to show the tserver
    states.
    
    $ kudu tserver list <master> --columns=uuid,state
    
    I opted to have the state be an optional return field because there are
    cases where a tablet that is unregistered will have a state associated
    with it (e.g. if the master is restarted), in which case it may be
    surprising to see info about them, unless explicitly requested.
    
    Change-Id: I5917927f3bc4bb6bebd1cfb63555ec20d94d9091
    Reviewed-on: http://gerrit.cloudera.org:8080/14369
    Reviewed-by: Adar Dembo <ad...@cloudera.com>
    Tested-by: Kudu Jenkins
---
 src/kudu/master/master.proto          |  6 +++++
 src/kudu/master/master_service.cc     | 28 +++++++++++++++++++-
 src/kudu/master/ts_manager.cc         |  5 ++++
 src/kudu/master/ts_manager.h          |  7 ++++-
 src/kudu/tools/kudu-tool-test.cc      | 49 +++++++++++++++++++++++++++++++++++
 src/kudu/tools/tool_action_tserver.cc | 21 ++++++++++-----
 6 files changed, 108 insertions(+), 8 deletions(-)

diff --git a/src/kudu/master/master.proto b/src/kudu/master/master.proto
index 9d37612..e480b6c 100644
--- a/src/kudu/master/master.proto
+++ b/src/kudu/master/master.proto
@@ -805,6 +805,11 @@ message HiveMetastoreConfig {
 // ============================================================================
 
 message ListTabletServersRequestPB {
+  // Whether or not to include the tserver states.
+  // Note: this may include states of tservers that haven't been registered and
+  // thus don't have a complete response Entries. In such cases, the Entries
+  // will be returned with bogus info w.r.t registration and heartbeating.
+  optional bool include_states = 1;
 }
 
 message ListTabletServersResponsePB {
@@ -815,6 +820,7 @@ message ListTabletServersResponsePB {
     optional ServerRegistrationPB registration = 2;
     optional int32 millis_since_heartbeat = 3;
     optional string location = 4;
+    optional TServerStatePB state = 5;
   }
   repeated Entry servers = 2;
 }
diff --git a/src/kudu/master/master_service.cc b/src/kudu/master/master_service.cc
index 5c23933..5b20d6b 100644
--- a/src/kudu/master/master_service.cc
+++ b/src/kudu/master/master_service.cc
@@ -20,6 +20,7 @@
 #include <memory>
 #include <ostream>
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -33,6 +34,7 @@
 #include "kudu/common/wire_protocol.pb.h"
 #include "kudu/consensus/metadata.pb.h"
 #include "kudu/consensus/replica_management.pb.h"
+#include "kudu/gutil/map-util.h"
 #include "kudu/gutil/port.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/hms/hms_catalog.h"
@@ -547,7 +549,10 @@ void MasterServiceImpl::GetTableSchema(const GetTableSchemaRequestPB* req,
 void MasterServiceImpl::ListTabletServers(const ListTabletServersRequestPB* req,
                                           ListTabletServersResponsePB* resp,
                                           rpc::RpcContext* rpc) {
-  vector<std::shared_ptr<TSDescriptor> > descs;
+  TSManager* ts_manager = server_->ts_manager();
+  TServerStateMap states = req->include_states() ?
+      ts_manager->GetTServerStates() : TServerStateMap();
+  vector<std::shared_ptr<TSDescriptor>> descs;
   server_->ts_manager()->GetAllDescriptors(&descs);
   for (const std::shared_ptr<TSDescriptor>& desc : descs) {
     ListTabletServersResponsePB::Entry* entry = resp->add_servers();
@@ -555,6 +560,27 @@ void MasterServiceImpl::ListTabletServers(const ListTabletServersRequestPB* req,
     desc->GetRegistration(entry->mutable_registration());
     entry->set_millis_since_heartbeat(desc->TimeSinceHeartbeat().ToMilliseconds());
     if (desc->location()) entry->set_location(desc->location().get());
+
+    // If we need to return states, do so.
+    const auto& uuid = desc->permanent_uuid();
+    const auto state = FindWithDefault(states, uuid, TServerStatePB::NONE);
+    entry->set_state(state);
+    states.erase(uuid);
+  }
+  // If there are any states left (e.g. they don't correspond to a registered
+  // server), report them. Set a bogus seqno, since the servers have not
+  // registered.
+  for (const auto& ts_and_state : states) {
+    const auto& ts = ts_and_state.first;
+    const auto& state = ts_and_state.second;
+    ListTabletServersResponsePB::Entry* entry = resp->add_servers();
+    NodeInstancePB* instance = entry->mutable_instance_id();
+    instance->set_permanent_uuid(ts);
+    instance->set_instance_seqno(-1);
+    entry->set_millis_since_heartbeat(-1);
+    // Note: The state map should only track non-NONE states.
+    DCHECK_NE(TServerStatePB::NONE, state);
+    entry->set_state(state);
   }
   rpc->RespondSuccess();
 }
diff --git a/src/kudu/master/ts_manager.cc b/src/kudu/master/ts_manager.cc
index 9caaa32..46f68d9 100644
--- a/src/kudu/master/ts_manager.cc
+++ b/src/kudu/master/ts_manager.cc
@@ -216,6 +216,11 @@ unordered_set<string> TSManager::GetUuidsToIgnoreForUnderreplication() const {
   return uuids;
 }
 
+TServerStateMap TSManager::GetTServerStates() const {
+  shared_lock<RWMutex> tsl(ts_state_lock_);
+  return ts_state_by_uuid_;
+}
+
 void TSManager::GetDescriptorsAvailableForPlacement(TSDescriptorVector* descs) const {
   descs->clear();
   shared_lock<RWMutex> tsl(ts_state_lock_);
diff --git a/src/kudu/master/ts_manager.h b/src/kudu/master/ts_manager.h
index 7e3e42f..dc5feb8 100644
--- a/src/kudu/master/ts_manager.h
+++ b/src/kudu/master/ts_manager.h
@@ -41,6 +41,8 @@ namespace master {
 class LocationCache;
 class SysCatalogTable;
 
+typedef std::unordered_map<std::string, TServerStatePB> TServerStateMap;
+
 // Tracks the servers that the master has heard from, along with their
 // last heartbeat, etc.
 //
@@ -96,6 +98,9 @@ class TSManager {
   // mode).
   std::unordered_set<std::string> GetUuidsToIgnoreForUnderreplication() const;
 
+  // Return the tablet server states for each tablet server UUID.
+  TServerStateMap GetTServerStates() const;
+
   // Get the TS count.
   int GetCount() const;
 
@@ -151,7 +156,7 @@ class TSManager {
 
   // Maps from the UUIDs of tablet servers to their tserver state, if any.
   // Note: the states don't necessarily belong to registered tablet servers.
-  std::unordered_map<std::string, TServerStatePB> ts_state_by_uuid_;
+  TServerStateMap ts_state_by_uuid_;
 
   LocationCache* location_cache_;
 
diff --git a/src/kudu/tools/kudu-tool-test.cc b/src/kudu/tools/kudu-tool-test.cc
index 30a09a0..a5066dc 100644
--- a/src/kudu/tools/kudu-tool-test.cc
+++ b/src/kudu/tools/kudu-tool-test.cc
@@ -2915,6 +2915,55 @@ TEST_F(ToolTest, TestTserverListLocationNotAssigned) {
   ASSERT_STR_CONTAINS(out, cluster_->tablet_server(0)->uuid() + ",<none>");
 }
 
+TEST_F(ToolTest, TestTServerListState) {
+  NO_FATALS(StartExternalMiniCluster());
+  string master_addr = cluster_->master()->bound_rpc_addr().ToString();
+  const string ts_uuid = cluster_->tablet_server(0)->uuid();
+
+  // First, set some tserver state.
+  NO_FATALS(RunActionStdoutNone(Substitute("tserver state enter_maintenance $0 $1",
+                                           master_addr, ts_uuid)));
+
+  // If the state isn't requested, we shouldn't see any.
+  string out;
+  NO_FATALS(RunActionStdoutString(
+        Substitute("tserver list $0 --columns=uuid --format=csv", master_addr), &out));
+  ASSERT_STR_NOT_CONTAINS(out, Substitute("$0,$1", ts_uuid, "MAINTENANCE_MODE"));
+
+  // If it is requested, we should see the state.
+  NO_FATALS(RunActionStdoutString(
+        Substitute("tserver list $0 --columns=uuid,state --format=csv", master_addr), &out));
+  ASSERT_STR_CONTAINS(out, Substitute("$0,$1", ts_uuid, "MAINTENANCE_MODE"));
+
+  // We should still see the state if the tserver hasn't been registered.
+  // "Unregister" the server by restarting the cluster and only bringing up the
+  // master, and verify that we still see the tserver in maintenance mode.
+  cluster_->Shutdown();
+  ASSERT_OK(cluster_->master()->Restart());
+  master_addr = cluster_->master()->bound_rpc_addr().ToString();
+  NO_FATALS(RunActionStdoutString(
+        Substitute("tserver list $0 --columns=uuid,state --format=csv", master_addr), &out));
+  ASSERT_STR_CONTAINS(out, Substitute("$0,$1", ts_uuid, "MAINTENANCE_MODE"));
+
+  NO_FATALS(RunActionStdoutNone(Substitute("tserver state exit_maintenance $0 $1",
+                                           master_addr, ts_uuid)));
+
+  // Once removed, we should see none.
+  // Since the tserver hasn't registered, there should be no output at all.
+  NO_FATALS(RunActionStdoutString(
+        Substitute("tserver list $0 --columns=uuid,state --format=csv", master_addr), &out));
+  ASSERT_EQ("", out);
+
+  // And once the tserver has registered, we should see no state. Assert
+  // eventually to give the tserver some time to register.
+  ASSERT_OK(cluster_->tablet_server(0)->Restart());
+  ASSERT_EVENTUALLY([&] {
+    NO_FATALS(RunActionStdoutString(
+          Substitute("tserver list $0 --columns=uuid,state --format=csv", master_addr), &out));
+    ASSERT_STR_CONTAINS(out, Substitute("$0,$1", ts_uuid, "NONE"));
+  });
+}
+
 TEST_F(ToolTest, TestMasterList) {
   ExternalMiniClusterOptions opts;
   opts.num_tablet_servers = 0;
diff --git a/src/kudu/tools/tool_action_tserver.cc b/src/kudu/tools/tool_action_tserver.cc
index 744458c..bf62257 100644
--- a/src/kudu/tools/tool_action_tserver.cc
+++ b/src/kudu/tools/tool_action_tserver.cc
@@ -32,7 +32,6 @@
 #include "kudu/gutil/map-util.h"
 #include "kudu/gutil/strings/join.h"
 #include "kudu/gutil/strings/split.h"
-#include "kudu/gutil/strings/stringpiece.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/master/master.pb.h"
 #include "kudu/master/master.proxy.h"
@@ -100,11 +99,17 @@ Status ListTServers(const RunnerContext& context) {
   LeaderMasterProxy proxy;
   RETURN_NOT_OK(proxy.Init(context));
 
+  const vector<string> cols = strings::Split(FLAGS_columns, ",", strings::SkipEmpty());
   ListTabletServersRequestPB req;
+  for (const auto& col : cols) {
+    if (boost::iequals(col, "state")) {
+      req.set_include_states(true);
+    }
+  }
   ListTabletServersResponsePB resp;
 
-  proxy.SyncRpc<ListTabletServersRequestPB, ListTabletServersResponsePB>(
-      req, &resp, "ListTabletServers", &MasterServiceProxy::ListTabletServersAsync);
+  RETURN_NOT_OK((proxy.SyncRpc<ListTabletServersRequestPB, ListTabletServersResponsePB>(
+      req, &resp, "ListTabletServers", &MasterServiceProxy::ListTabletServersAsync)));
 
   if (resp.has_error()) {
     return StatusFromPB(resp.error().status());
@@ -117,7 +122,7 @@ Status ListTServers(const RunnerContext& context) {
     return strings::Substitute("$0:$1", hostport.host(), hostport.port());
   };
 
-  for (const auto& column : strings::Split(FLAGS_columns, ",", strings::SkipEmpty())) {
+  for (const auto& column : cols) {
     vector<string> values;
     if (boost::iequals(column, "uuid")) {
       for (const auto& server : servers) {
@@ -156,10 +161,14 @@ Status ListTServers(const RunnerContext& context) {
       for (const auto& server : servers) {
         values.emplace_back(StartTimeToString(server.registration()));
       }
+    } else if (boost::iequals(column, "state")) {
+      for (const auto& server : servers) {
+        values.emplace_back(TServerStatePB_Name(server.state()));
+      }
     } else {
       return Status::InvalidArgument("unknown column (--columns)", column);
     }
-    table.AddColumn(column.ToString(), std::move(values));
+    table.AddColumn(column, std::move(values));
   }
 
   RETURN_NOT_OK(table.PrintTo(cout));
@@ -248,7 +257,7 @@ unique_ptr<Mode> BuildTServerMode() {
                             string("Comma-separated list of tserver info fields to "
                                    "include in output.\nPossible values: uuid, "
                                    "rpc-addresses, http-addresses, version, seqno, "
-                                   "heartbeat and start_time"))
+                                   "heartbeat, start_time, state"))
       .AddOptionalParameter("format")
       .AddOptionalParameter("timeout_ms")
       .Build();