You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by aw...@apache.org on 2019/10/08 22:57:00 UTC

[kudu] 02/03: KUDU-2069 p7: add tool to enter/exit maintenance

This is an automated email from the ASF dual-hosted git repository.

awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit ecc89cf10d849c56c941984a9f74d6eb8ffb81c1
Author: Andrew Wong <aw...@apache.org>
AuthorDate: Wed Sep 25 19:04:57 2019 -0700

    KUDU-2069 p7: add tool to enter/exit maintenance
    
    Adds the following tools:
    $ kudu tserver state enter_maintenance <masters> <ts_id>
    $ kudu tserver state exit_maintenance <masters> <ts_id>
    
    This opts to have users specify the transition they expect to enact,
    rather than the desired end state. This is because I expect the same
    tool to be used for decommissioning. When that materializes, it will be
    valuable to have users specify their expected starting state to avoid
    accidentally overwriting tserver states unintentionally.
    
    It also adds an option into the TSManager to allow the tool to raise an
    error when running it with a tserver that doesn't exist. This isn't
    always desired behavior of the TSManager because we will reload tserver
    states when the master first starts up, at which point tservers may not
    be registered yet. It is accessible through a CLI tool optional
    argument.
    
    Change-Id: Ifab64dcb5b7b0d171502c9154c83113c82e5972d
    Reviewed-on: http://gerrit.cloudera.org:8080/14368
    Tested-by: Kudu Jenkins
    Reviewed-by: Alexey Serbin <as...@cloudera.com>
    Reviewed-by: Adar Dembo <ad...@cloudera.com>
---
 src/kudu/client/master_proxy_rpc.cc   |  3 ++
 src/kudu/master/master.proto          | 10 +++++
 src/kudu/master/master_service.cc     |  2 +-
 src/kudu/master/ts_manager.cc         | 13 ++++++
 src/kudu/master/ts_manager.h          |  1 +
 src/kudu/master/ts_state-test.cc      |  4 +-
 src/kudu/tools/kudu-tool-test.cc      | 75 +++++++++++++++++++++++++++++++++++
 src/kudu/tools/tool_action_common.cc  | 10 +++++
 src/kudu/tools/tool_action_tserver.cc | 63 +++++++++++++++++++++++++++++
 9 files changed, 179 insertions(+), 2 deletions(-)

diff --git a/src/kudu/client/master_proxy_rpc.cc b/src/kudu/client/master_proxy_rpc.cc
index 5f63d82..412d22c 100644
--- a/src/kudu/client/master_proxy_rpc.cc
+++ b/src/kudu/client/master_proxy_rpc.cc
@@ -50,6 +50,8 @@ namespace kudu {
 
 using master::AlterTableRequestPB;
 using master::AlterTableResponsePB;
+using master::ChangeTServerStateRequestPB;
+using master::ChangeTServerStateResponsePB;
 using master::CreateTableRequestPB;
 using master::CreateTableResponsePB;
 using master::DeleteTableRequestPB;
@@ -285,6 +287,7 @@ bool AsyncLeaderMasterRpc<ReqClass, RespClass>::RetryOrReconnectIfNecessary(
   return false;
 }
 
+template class AsyncLeaderMasterRpc<ChangeTServerStateRequestPB, ChangeTServerStateResponsePB>;
 template class AsyncLeaderMasterRpc<CreateTableRequestPB, CreateTableResponsePB>;
 template class AsyncLeaderMasterRpc<IsCreateTableDoneRequestPB, IsCreateTableDoneResponsePB>;
 template class AsyncLeaderMasterRpc<DeleteTableRequestPB, DeleteTableResponsePB>;
diff --git a/src/kudu/master/master.proto b/src/kudu/master/master.proto
index 63297cf..9d37612 100644
--- a/src/kudu/master/master.proto
+++ b/src/kudu/master/master.proto
@@ -858,6 +858,16 @@ message TServerStateChangePB {
 message ChangeTServerStateRequestPB {
   // TODO(awong): consider setting tserver state in batches.
   optional TServerStateChangePB change = 1;
+
+  enum HandleMissingTS {
+    UNKNOWN_HANDLING = 0;
+    ALLOW_MISSING_TSERVER = 1;
+    DONT_ALLOW_MISSING_TSERVER = 2;
+  }
+  // If set to ALLOW_MISSING_TSERVER, will allow setting the state of the
+  // tserver even if it hasn't been registered with the master and there is no
+  // existing state associated with that server.
+  optional HandleMissingTS handle_missing_tserver = 2 [default = DONT_ALLOW_MISSING_TSERVER];
 }
 
 message ChangeTServerStateResponsePB {
diff --git a/src/kudu/master/master_service.cc b/src/kudu/master/master_service.cc
index c206697..5c23933 100644
--- a/src/kudu/master/master_service.cc
+++ b/src/kudu/master/master_service.cc
@@ -221,7 +221,7 @@ void MasterServiceImpl::ChangeTServerState(const ChangeTServerStateRequestPB* re
 
   // Set the appropriate state for the given tserver.
   s = server_->ts_manager()->SetTServerState(ts_uuid, to_state,
-      server_->catalog_manager()->sys_catalog());
+      req->handle_missing_tserver(), server_->catalog_manager()->sys_catalog());
   if (PREDICT_FALSE(!s.ok())) {
     rpc->RespondFailure(s);
     return;
diff --git a/src/kudu/master/ts_manager.cc b/src/kudu/master/ts_manager.cc
index a44526b..9caaa32 100644
--- a/src/kudu/master/ts_manager.cc
+++ b/src/kudu/master/ts_manager.cc
@@ -231,12 +231,21 @@ void TSManager::GetDescriptorsAvailableForPlacement(TSDescriptorVector* descs) c
 
 Status TSManager::SetTServerState(const string& ts_uuid,
                                   TServerStatePB ts_state,
+                                  ChangeTServerStateRequestPB::HandleMissingTS handle_missing_ts,
                                   SysCatalogTable* sys_catalog) {
   lock_guard<RWMutex> l(ts_state_lock_);
   auto existing_state = FindWithDefault(ts_state_by_uuid_, ts_uuid, TServerStatePB::NONE);
   if (existing_state == ts_state) {
     return Status::OK();
   }
+  // If there is no existing state for the tserver, and the tserver hasn't yet
+  // been registered, return an error, as appropriate.
+  shared_ptr<TSDescriptor> ts_desc;
+  if (handle_missing_ts != ChangeTServerStateRequestPB::ALLOW_MISSING_TSERVER &&
+      existing_state == TServerStatePB::NONE && !LookupTSByUUID(ts_uuid, &ts_desc)) {
+    return Status::NotFound(Substitute("Requested tserver $0 has not been registered",
+                                       ts_uuid));
+  }
   if (ts_state == TServerStatePB::NONE) {
     RETURN_NOT_OK_PREPEND(sys_catalog->RemoveTServerState(ts_uuid),
         Substitute("Failed to remove tserver state for $0", ts_uuid));
@@ -245,6 +254,8 @@ Status TSManager::SetTServerState(const string& ts_uuid,
     // may have been ignored while in maintenance mode are reprocessed. To do
     // so, request full tablet reports across all tablet servers.
     SetAllTServersNeedFullTabletReports();
+    LOG(INFO) << Substitute("Unset tserver state for $0 from $1",
+                            ts_uuid, TServerStatePB_Name(existing_state));
     return Status::OK();
   }
   SysTServerStateEntryPB pb;
@@ -253,6 +264,8 @@ Status TSManager::SetTServerState(const string& ts_uuid,
       Substitute("Failed to set tserver state for $0 to $1",
                  ts_uuid, TServerStatePB_Name(ts_state)));
   InsertOrUpdate(&ts_state_by_uuid_, ts_uuid, ts_state);
+  LOG(INFO) << Substitute("Set tserver state for $0 to $1",
+                          ts_uuid, TServerStatePB_Name(ts_state));
   return Status::OK();
 }
 
diff --git a/src/kudu/master/ts_manager.h b/src/kudu/master/ts_manager.h
index cb0c6dc..7e3e42f 100644
--- a/src/kudu/master/ts_manager.h
+++ b/src/kudu/master/ts_manager.h
@@ -105,6 +105,7 @@ class TSManager {
   // tablet servers must report back a full tablet reports.
   Status SetTServerState(const std::string& ts_uuid,
                          TServerStatePB ts_state,
+                         ChangeTServerStateRequestPB::HandleMissingTS handle_missing_ts,
                          SysCatalogTable* sys_catalog);
 
   // Return the tserver state for the given tablet server UUID, or NONE if one
diff --git a/src/kudu/master/ts_state-test.cc b/src/kudu/master/ts_state-test.cc
index bb48871..6931fa7 100644
--- a/src/kudu/master/ts_state-test.cc
+++ b/src/kudu/master/ts_state-test.cc
@@ -108,7 +108,8 @@ class TServerStateTest : public KuduTest {
   // Sets the tserver state for the given tablet server to 'state'.
   Status SetTServerState(const string& tserver_uuid, TServerStatePB state) {
     return master_->ts_manager()->SetTServerState(
-        tserver_uuid, state, master_->catalog_manager()->sys_catalog());
+        tserver_uuid, state, ChangeTServerStateRequestPB::ALLOW_MISSING_TSERVER,
+        master_->catalog_manager()->sys_catalog());
   }
 
   // Pretends to be a tserver by sending a heartbeat to the master from the
@@ -317,6 +318,7 @@ TEST_F(TServerStateTest, MaintenanceModeTServerDoesntGetNewReplicas) {
 // Test to exercise the RPC endpoint to change the tserver state.
 TEST_F(TServerStateTest, TestRPCs) {
   ChangeTServerStateRequestPB req;
+  req.set_handle_missing_tserver(ChangeTServerStateRequestPB::ALLOW_MISSING_TSERVER);
   Status s;
   // Sends a state change RPC and ensures there's an error, matching the
   // input error string if provided.
diff --git a/src/kudu/tools/kudu-tool-test.cc b/src/kudu/tools/kudu-tool-test.cc
index 705d1d5..30a09a0 100644
--- a/src/kudu/tools/kudu-tool-test.cc
+++ b/src/kudu/tools/kudu-tool-test.cc
@@ -91,6 +91,10 @@
 #include "kudu/integration-tests/cluster_verifier.h"
 #include "kudu/integration-tests/mini_cluster_fs_inspector.h"
 #include "kudu/integration-tests/test_workload.h"
+#include "kudu/master/master.h"
+#include "kudu/master/master.pb.h"
+#include "kudu/master/mini_master.h"
+#include "kudu/master/ts_manager.h"
 #include "kudu/mini-cluster/external_mini_cluster.h"
 #include "kudu/mini-cluster/internal_mini_cluster.h"
 #include "kudu/mini-cluster/mini_cluster.h"
@@ -176,6 +180,9 @@ using kudu::itest::MiniClusterFsInspector;
 using kudu::itest::TServerDetails;
 using kudu::log::Log;
 using kudu::log::LogOptions;
+using kudu::master::MiniMaster;
+using kudu::master::TServerStatePB;
+using kudu::master::TSManager;
 using kudu::rpc::RpcController;
 using kudu::tablet::LocalTabletWriter;
 using kudu::tablet::Tablet;
@@ -1141,6 +1148,7 @@ TEST_F(ToolTest, TestModeHelp) {
         "dump_memtrackers.*Dump the memtrackers",
         "get_flags.*Get the gflags",
         "set_flag.*Change a gflag value",
+        "state.*Operate on the state",
         "status.*Get the status",
         "timestamp.*Get the current timestamp",
         "list.*List tablet servers"
@@ -1148,6 +1156,13 @@ TEST_F(ToolTest, TestModeHelp) {
     NO_FATALS(RunTestHelp("tserver", kTServerModeRegexes));
   }
   {
+    const vector<string> kTServerSetStateModeRegexes = {
+        "enter_maintenance.*Begin maintenance on the Tablet Server",
+        "exit_maintenance.*End maintenance of the Tablet Server",
+    };
+    NO_FATALS(RunTestHelp("tserver state", kTServerSetStateModeRegexes));
+  }
+  {
     const vector<string> kWalModeRegexes = {
         "dump.*Dump a WAL",
     };
@@ -4749,6 +4764,66 @@ TEST_F(ToolTest, TestFsSwappingDirectoriesFailsGracefully) {
       wal_root, JoinStrings(new_data_roots, ","))));
 }
 
+TEST_F(ToolTest, TestStartEndMaintenanceMode) {
+  NO_FATALS(StartMiniCluster());
+  // Perform the steps on a tserver that exists and one that doesn't.
+  const string& kDummyUuid = "foobaruuid";
+  MiniMaster* mini_master = mini_cluster_->mini_master();
+  const string& ts_uuid = mini_cluster_->mini_tablet_server(0)->uuid();
+  TSManager* ts_manager = mini_master->master()->ts_manager();
+
+  // First, do a sanity check that our tservers have no state at first.
+  ASSERT_EQ(TServerStatePB::NONE, ts_manager->GetTServerState(ts_uuid));
+  ASSERT_EQ(TServerStatePB::NONE, ts_manager->GetTServerState(kDummyUuid));
+
+  // Enter maintenance mode twice. The second time should no-op.
+  for (int i = 0; i < 2; i++) {
+    NO_FATALS(RunActionStdoutNone(
+        Substitute("tserver state enter_maintenance $0 $1",
+                  mini_master->bound_rpc_addr().ToString(), ts_uuid)));
+    ASSERT_EQ(TServerStatePB::MAINTENANCE_MODE, ts_manager->GetTServerState(ts_uuid));
+    ASSERT_EQ(TServerStatePB::NONE, ts_manager->GetTServerState(kDummyUuid));
+  }
+
+  // When entering maintenance mode on a tserver that doesn't exist, we should
+  // get an error.
+  string stderr;
+  ASSERT_TRUE(RunActionStderrString(
+      Substitute("tserver state enter_maintenance $0 $1",
+                 mini_master->bound_rpc_addr().ToString(), kDummyUuid), &stderr).IsRuntimeError());
+  ASSERT_STR_CONTAINS(stderr, "has not been registered");
+  ASSERT_EQ(TServerStatePB::MAINTENANCE_MODE, ts_manager->GetTServerState(ts_uuid));
+  ASSERT_EQ(TServerStatePB::NONE, ts_manager->GetTServerState(kDummyUuid));
+
+  // But operators are able to specify a flag to force the maintenance mode,
+  // despite the tserver not being registered.
+  NO_FATALS(RunActionStdoutNone(
+      Substitute("tserver state enter_maintenance $0 $1 --allow_missing_tserver",
+                 mini_master->bound_rpc_addr().ToString(), kDummyUuid)));
+  ASSERT_EQ(TServerStatePB::MAINTENANCE_MODE, ts_manager->GetTServerState(ts_uuid));
+  ASSERT_EQ(TServerStatePB::MAINTENANCE_MODE, ts_manager->GetTServerState(kDummyUuid));
+
+  // Exit maintenance mode twice. The second time should no-op.
+  for (int i = 0; i < 2; i++) {
+    NO_FATALS(RunActionStdoutNone(
+        Substitute("tserver state exit_maintenance $0 $1",
+                  mini_master->bound_rpc_addr().ToString(), ts_uuid)));
+    ASSERT_EQ(TServerStatePB::NONE, ts_manager->GetTServerState(ts_uuid));
+    ASSERT_EQ(TServerStatePB::MAINTENANCE_MODE, ts_manager->GetTServerState(kDummyUuid));
+  }
+
+  // No flag is necessary to remove the state of a tserver that hasn't been
+  // registered. The second time should no-op as well, even though the dummy
+  // tserver doesn't exist.
+  for (int i = 0; i < 2; i++) {
+    NO_FATALS(RunActionStdoutNone(
+        Substitute("tserver state exit_maintenance $0 $1",
+                  mini_master->bound_rpc_addr().ToString(), kDummyUuid)));
+    ASSERT_EQ(TServerStatePB::NONE, ts_manager->GetTServerState(ts_uuid));
+    ASSERT_EQ(TServerStatePB::NONE, ts_manager->GetTServerState(kDummyUuid));
+  }
+}
+
 TEST_F(ToolTest, TestFsRemoveDataDirWithTombstone) {
   if (FLAGS_block_manager == "file") {
     LOG(INFO) << "Skipping test, only log block manager is supported";
diff --git a/src/kudu/tools/tool_action_common.cc b/src/kudu/tools/tool_action_common.cc
index ce0efa4..a512ab2 100644
--- a/src/kudu/tools/tool_action_common.cc
+++ b/src/kudu/tools/tool_action_common.cc
@@ -811,6 +811,16 @@ Status LeaderMasterProxy::SyncRpc(const Req& req,
 // Explicit specializations for callers outside this compilation unit.
 template
 Status LeaderMasterProxy::SyncRpc(
+    const master::ChangeTServerStateRequestPB& req,
+    master::ChangeTServerStateResponsePB* resp,
+    string func_name,
+    const boost::function<void(MasterServiceProxy*,
+                               const master::ChangeTServerStateRequestPB&,
+                               master::ChangeTServerStateResponsePB*,
+                               RpcController*,
+                               const ResponseCallback&)>& func);
+template
+Status LeaderMasterProxy::SyncRpc(
     const master::ListTabletServersRequestPB& req,
     master::ListTabletServersResponsePB* resp,
     string func_name,
diff --git a/src/kudu/tools/tool_action_tserver.cc b/src/kudu/tools/tool_action_tserver.cc
index 95fa07e..744458c 100644
--- a/src/kudu/tools/tool_action_tserver.cc
+++ b/src/kudu/tools/tool_action_tserver.cc
@@ -23,6 +23,7 @@
 #include <vector>
 
 #include <boost/algorithm/string/predicate.hpp>
+#include <gflags/gflags.h>
 #include <gflags/gflags_declare.h>
 
 #include "kudu/common/common.pb.h"
@@ -40,6 +41,10 @@
 #include "kudu/tserver/tablet_server.h"
 #include "kudu/util/status.h"
 
+DEFINE_bool(allow_missing_tserver, false, "If true, performs the action on the "
+    "tserver even if it has not been registered with the master and has no "
+    "existing tserver state records associated with it.");
+
 DECLARE_string(columns);
 
 using std::cout;
@@ -49,9 +54,12 @@ using std::vector;
 
 namespace kudu {
 
+using master::ChangeTServerStateRequestPB;
+using master::ChangeTServerStateResponsePB;
 using master::ListTabletServersRequestPB;
 using master::ListTabletServersResponsePB;
 using master::MasterServiceProxy;
+using master::TServerStateChangePB;
 
 namespace tools {
 namespace {
@@ -60,6 +68,8 @@ const char* const kTServerAddressArg = "tserver_address";
 const char* const kTServerAddressDesc = "Address of a Kudu Tablet Server of "
     "form 'hostname:port'. Port may be omitted if the Tablet Server is bound "
     "to the default port.";
+const char* const kTServerIdArg = "tserver_uuid";
+const char* const kTServerIdDesc = "UUID of a Kudu Tablet Server";
 const char* const kFlagArg = "flag";
 const char* const kValueArg = "value";
 
@@ -161,6 +171,34 @@ Status TserverDumpMemTrackers(const RunnerContext& context) {
   return DumpMemTrackers(address, tserver::TabletServer::kDefaultPort);
 }
 
+Status TServerSetState(const RunnerContext& context, TServerStateChangePB::StateChange sc) {
+  ChangeTServerStateRequestPB req;
+  ChangeTServerStateResponsePB resp;
+  const string& tserver_uuid = FindOrDie(context.required_args, kTServerIdArg);
+  TServerStateChangePB* change = req.mutable_change();
+  change->set_uuid(tserver_uuid);
+  change->set_change(sc);
+  if (FLAGS_allow_missing_tserver) {
+    req.set_handle_missing_tserver(ChangeTServerStateRequestPB::ALLOW_MISSING_TSERVER);
+  }
+  LeaderMasterProxy proxy;
+  RETURN_NOT_OK(proxy.Init(context));
+  RETURN_NOT_OK((proxy.SyncRpc<ChangeTServerStateRequestPB, ChangeTServerStateResponsePB>(
+      req, &resp, "ChangeTServerState", &MasterServiceProxy::ChangeTServerStateAsync)));
+  if (resp.has_error()) {
+    return StatusFromPB(resp.error().status());
+  }
+  return Status::OK();
+}
+
+Status EnterMaintenance(const RunnerContext& context) {
+  return TServerSetState(context, TServerStateChangePB::ENTER_MAINTENANCE_MODE);
+}
+
+Status ExitMaintenance(const RunnerContext& context) {
+  return TServerSetState(context, TServerStateChangePB::EXIT_MAINTENANCE_MODE);
+}
+
 } // anonymous namespace
 
 unique_ptr<Mode> BuildTServerMode() {
@@ -215,6 +253,30 @@ unique_ptr<Mode> BuildTServerMode() {
       .AddOptionalParameter("timeout_ms")
       .Build();
 
+  unique_ptr<Action> enter_maintenance =
+      ActionBuilder("enter_maintenance", &EnterMaintenance)
+      .Description("Begin maintenance on the Tablet Server. While under "
+                   "maintenance, downtime of the Tablet Server will not lead "
+                   "to the immediate re-replication of its tablet replicas.")
+      .AddRequiredParameter({ kMasterAddressesArg, kMasterAddressesArgDesc })
+      .AddRequiredParameter({ kTServerIdArg, kTServerIdDesc })
+      .AddOptionalParameter("allow_missing_tserver")
+      .Build();
+  // Note: --allow_missing_tserver doesn't make sense for exit_maintenance
+  // because if the tserver is missing, the non-existent tserver's state is
+  // already NONE and so exit_maintenance is a no-op.
+  unique_ptr<Action> exit_maintenance =
+      ActionBuilder("exit_maintenance", &ExitMaintenance)
+      .Description("End maintenance of the Tablet Server.")
+      .AddRequiredParameter({ kMasterAddressesArg, kMasterAddressesArgDesc })
+      .AddRequiredParameter({ kTServerIdArg, kTServerIdDesc })
+      .Build();
+  unique_ptr<Mode> state = ModeBuilder("state")
+      .Description("Operate on the state of a Kudu Tablet Server")
+      .AddAction(std::move(enter_maintenance))
+      .AddAction(std::move(exit_maintenance))
+      .Build();
+
   return ModeBuilder("tserver")
       .Description("Operate on a Kudu Tablet Server")
       .AddAction(std::move(dump_memtrackers))
@@ -223,6 +285,7 @@ unique_ptr<Mode> BuildTServerMode() {
       .AddAction(std::move(status))
       .AddAction(std::move(timestamp))
       .AddAction(std::move(list_tservers))
+      .AddMode(std::move(state))
       .Build();
 }