You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by mp...@apache.org on 2017/11/29 07:21:23 UTC

[3/3] kudu git commit: KUDU-1097 (patch 5b): kudu tablet config_change move should use 3-4-3

KUDU-1097 (patch 5b): kudu tablet config_change move should use 3-4-3

This patch adds support for 3-4-3 re-replication to the "replica move"
tool.

Change-Id: I259aeaf24f60a05b88c5c3e0e11d603aa75646a5
Reviewed-on: http://gerrit.cloudera.org:8080/8645
Tested-by: Kudu Jenkins
Reviewed-by: Alexey Serbin <as...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/9120cdd1
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/9120cdd1
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/9120cdd1

Branch: refs/heads/master
Commit: 9120cdd1d288ef6b3e03e2cd8445436712a7d4a9
Parents: eb0718e
Author: Mike Percy <mp...@apache.org>
Authored: Mon Nov 27 02:46:24 2017 -0800
Committer: Mike Percy <mp...@apache.org>
Committed: Wed Nov 29 07:20:56 2017 +0000

----------------------------------------------------------------------
 .../integration-tests/cluster_itest_util.cc     |  22 +++-
 src/kudu/integration-tests/cluster_itest_util.h |  11 +-
 .../raft_config_change-itest.cc                 |   2 +-
 src/kudu/tools/kudu-admin-test.cc               |  52 +++++++-
 src/kudu/tools/tool_action_tablet.cc            | 127 ++++++++++++++++---
 5 files changed, 185 insertions(+), 29 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/9120cdd1/src/kudu/integration-tests/cluster_itest_util.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/cluster_itest_util.cc b/src/kudu/integration-tests/cluster_itest_util.cc
index 757f6d0..089dba1 100644
--- a/src/kudu/integration-tests/cluster_itest_util.cc
+++ b/src/kudu/integration-tests/cluster_itest_util.cc
@@ -23,6 +23,7 @@
 #include <boost/optional/optional.hpp>
 #include <glog/logging.h>
 #include <glog/stl_logging.h>
+#include <gtest/gtest.h>
 #include <rapidjson/document.h>
 
 #include "kudu/client/schema.h"
@@ -58,6 +59,8 @@
 #include "kudu/util/net/sockaddr.h"
 #include "kudu/util/pb_util.h"
 #include "kudu/util/status.h"
+#include "kudu/util/test_macros.h"
+#include "kudu/util/test_util.h"
 
 namespace kudu {
 namespace itest {
@@ -366,7 +369,7 @@ Status WaitUntilNoPendingConfig(const TServerDetails* replica,
                                      SecureShortDebugString(cstate_tmp), s.ToString()));
 }
 
-Status WaitUntilCommittedConfigNumVotersIs(int config_size,
+Status WaitUntilCommittedConfigNumVotersIs(int num_voters,
                                            const TServerDetails* replica,
                                            const std::string& tablet_id,
                                            const MonoDelta& timeout) {
@@ -381,7 +384,7 @@ Status WaitUntilCommittedConfigNumVotersIs(int config_size,
     MonoDelta remaining_timeout = deadline - MonoTime::Now();
     s = GetConsensusState(replica, tablet_id, remaining_timeout, &cstate);
     if (s.ok()) {
-      if (CountVoters(cstate.committed_config()) == config_size) {
+      if (CountVoters(cstate.committed_config()) == num_voters) {
         return Status::OK();
       }
     }
@@ -394,10 +397,23 @@ Status WaitUntilCommittedConfigNumVotersIs(int config_size,
   }
   return Status::TimedOut(Substitute("Number of voters does not equal $0 after waiting for $1. "
                                      "Last consensus state: $2. Last status: $3",
-                                     config_size, timeout.ToString(),
+                                     num_voters, timeout.ToString(),
                                      SecureShortDebugString(cstate), s.ToString()));
 }
 
+void WaitUntilCommittedConfigNumMembersIs(int num_members,
+                                          const TServerDetails* replica,
+                                          const std::string& tablet_id,
+                                          const MonoDelta& timeout) {
+  MonoTime deadline = MonoTime::Now() + timeout;
+  AssertEventually([&] {
+    ConsensusStatePB cstate;
+    ASSERT_OK(GetConsensusState(replica, tablet_id, deadline - MonoTime::Now(), &cstate));
+    ASSERT_EQ(num_members, cstate.committed_config().peers_size());
+  }, timeout);
+  NO_PENDING_FATALS();
+}
+
 Status WaitUntilCommittedConfigOpIdIndexIs(int64_t opid_index,
                                            const TServerDetails* replica,
                                            const std::string& tablet_id,

http://git-wip-us.apache.org/repos/asf/kudu/blob/9120cdd1/src/kudu/integration-tests/cluster_itest_util.h
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/cluster_itest_util.h b/src/kudu/integration-tests/cluster_itest_util.h
index 64260f1..215a90c 100644
--- a/src/kudu/integration-tests/cluster_itest_util.h
+++ b/src/kudu/integration-tests/cluster_itest_util.h
@@ -159,12 +159,19 @@ Status WaitUntilNoPendingConfig(const TServerDetails* replica,
                                 consensus::ConsensusStatePB* cstate = nullptr);
 
 // Wait until the number of voters in the committed consensus configuration is
-// 'quorum_size', according to the specified replica.
-Status WaitUntilCommittedConfigNumVotersIs(int config_size,
+// 'num_voters', according to the specified replica.
+Status WaitUntilCommittedConfigNumVotersIs(int num_voters,
                                            const TServerDetails* replica,
                                            const std::string& tablet_id,
                                            const MonoDelta& timeout);
 
+// Wait until the number of voters in the committed consensus configuration is
+// 'num_members', according to the specified replica.
+void WaitUntilCommittedConfigNumMembersIs(int num_members,
+                                          const TServerDetails* replica,
+                                          const std::string& tablet_id,
+                                          const MonoDelta& timeout);
+
 // Wait until the opid_index of the committed consensus config on the
 // specified tablet is 'opid_index'.
 Status WaitUntilCommittedConfigOpIdIndexIs(int64_t opid_index,

http://git-wip-us.apache.org/repos/asf/kudu/blob/9120cdd1/src/kudu/integration-tests/raft_config_change-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_config_change-itest.cc b/src/kudu/integration-tests/raft_config_change-itest.cc
index 8df15a5..4099727 100644
--- a/src/kudu/integration-tests/raft_config_change-itest.cc
+++ b/src/kudu/integration-tests/raft_config_change-itest.cc
@@ -219,7 +219,7 @@ TEST_F(RaftConfigChangeITest, TestNonVoterPromotion) {
   });
 
   // Wait for there to be 4 voters in the config.
-  ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(/*config_size=*/ 4,
+  ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(/*num_voters=*/ 4,
                                                 leader_replica,
                                                 tablet_id,
                                                 kTimeout));

http://git-wip-us.apache.org/repos/asf/kudu/blob/9120cdd1/src/kudu/tools/kudu-admin-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/kudu-admin-test.cc b/src/kudu/tools/kudu-admin-test.cc
index 1506238..7581116 100644
--- a/src/kudu/tools/kudu-admin-test.cc
+++ b/src/kudu/tools/kudu-admin-test.cc
@@ -96,6 +96,13 @@ namespace kudu {
 namespace tools {
 
 class AdminCliTest : public tserver::TabletServerIntegrationTestBase {
+ protected:
+  enum EnableKudu1097 {
+    kDisableKudu1097,
+    kEnableKudu1097
+  };
+
+  void DoTestMoveTablet(EnableKudu1097 enable_kudu_1097);
 };
 
 // Test config change while running a workload.
@@ -211,10 +218,17 @@ TEST_F(AdminCliTest, TestChangeConfig) {
 // 3. Start a workload.
 // 4. Using the CLI, move the 3 replicas around the 5 TS.
 // 5. Profit!
-TEST_F(AdminCliTest, TestMoveTablet) {
+void AdminCliTest::DoTestMoveTablet(EnableKudu1097 enable_kudu_1097) {
+  const string kKudu1097Flag = "--raft_prepare_replacement_before_eviction=true";
+
   FLAGS_num_tablet_servers = 5;
   FLAGS_num_replicas = 3;
-  NO_FATALS(BuildAndStart());
+
+  vector<string> ts_flags, master_flags;
+  if (enable_kudu_1097) {
+    ts_flags = master_flags = { kKudu1097Flag };
+  }
+  NO_FATALS(BuildAndStart(ts_flags, master_flags));
 
   vector<string> tservers;
   AppendKeysFromMap(tablet_servers_, &tservers);
@@ -249,15 +263,28 @@ TEST_F(AdminCliTest, TestMoveTablet) {
   for (int i = 0; i < num_moves; i++) {
     const string remove = active_tservers.front();
     const string add = inactive_tservers.front();
-    ASSERT_OK(RunKuduTool({
+    vector<string> tool_command = {
       "tablet",
       "change_config",
       "move_replica",
+    };
+    vector<string> kudu_1097_args = {
+      "--unlock_experimental_flags",
+      kKudu1097Flag,
+    };
+    vector<string> tool_args = {
       cluster_->master()->bound_rpc_addr().ToString(),
       tablet_id_,
       remove,
-      add
-    }));
+      add,
+    };
+    if (enable_kudu_1097 == kEnableKudu1097) {
+      // Only add these arguments if we running with Kudu 1097 enabled.
+      tool_command.insert(tool_command.end(), kudu_1097_args.begin(), kudu_1097_args.end());
+    }
+    tool_command.insert(tool_command.end(), tool_args.begin(), tool_args.end());
+
+    ASSERT_OK(RunKuduTool(tool_command));
     active_tservers.pop_front();
     active_tservers.push_back(add);
     inactive_tservers.pop_front();
@@ -269,8 +296,13 @@ TEST_F(AdminCliTest, TestMoveTablet) {
     for (const string& uuid : active_tservers) {
       InsertOrDie(&active_tservers_map, uuid, tablet_servers_[uuid]);
     }
-    ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(FLAGS_num_replicas, active_tservers_map[add],
+    ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(/*num_voters=*/ FLAGS_num_replicas,
+                                                  active_tservers_map[add],
                                                   tablet_id_, MonoDelta::FromSeconds(30)));
+    NO_FATALS(WaitUntilCommittedConfigNumMembersIs(/*num_members=*/ FLAGS_num_replicas,
+                                                   active_tservers_map[add],
+                                                   tablet_id_, MonoDelta::FromSeconds(30)));
+
   }
   workload.StopAndJoin();
 
@@ -278,6 +310,14 @@ TEST_F(AdminCliTest, TestMoveTablet) {
   NO_FATALS(v.CheckCluster());
 }
 
+TEST_F(AdminCliTest, TestMoveTablet_pre_KUDU_1097) {
+  DoTestMoveTablet(kDisableKudu1097);
+}
+
+TEST_F(AdminCliTest, TestMoveTablet_KUDU_1097) {
+  DoTestMoveTablet(kEnableKudu1097);
+}
+
 Status RunUnsafeChangeConfig(const string& tablet_id,
                              const string& dst_host,
                              vector<string> peer_uuid_list) {

http://git-wip-us.apache.org/repos/asf/kudu/blob/9120cdd1/src/kudu/tools/tool_action_tablet.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/tool_action_tablet.cc b/src/kudu/tools/tool_action_tablet.cc
index 1a27ecc..c6c9209 100644
--- a/src/kudu/tools/tool_action_tablet.cc
+++ b/src/kudu/tools/tool_action_tablet.cc
@@ -28,6 +28,7 @@
 
 #include <boost/optional/optional.hpp>
 #include <gflags/gflags.h>
+#include <gflags/gflags_declare.h>
 #include <glog/logging.h>
 
 #include "kudu/client/client.h"
@@ -38,6 +39,7 @@
 #include "kudu/consensus/metadata.pb.h"
 #include "kudu/consensus/opid.pb.h"
 #include "kudu/consensus/opid_util.h"
+#include "kudu/gutil/basictypes.h"
 #include "kudu/gutil/map-util.h"
 #include "kudu/gutil/stl_util.h"
 #include "kudu/gutil/strings/split.h"
@@ -56,6 +58,7 @@ DEFINE_int64(move_copy_timeout_sec, 600,
              "Number of seconds to wait for tablet copy to complete when relocating a tablet");
 DEFINE_int64(move_leader_timeout_sec, 30,
              "Number of seconds to wait for a leader when relocating a leader tablet");
+DECLARE_bool(raft_prepare_replacement_before_eviction);
 
 namespace kudu {
 namespace tools {
@@ -64,6 +67,8 @@ using client::KuduClient;
 using client::KuduClientBuilder;
 using client::KuduTablet;
 using client::KuduTabletServer;
+using consensus::ADD_PEER;
+using consensus::BulkChangeConfigRequestPB;
 using consensus::ChangeConfigType;
 using consensus::ConsensusServiceProxy;
 using consensus::ConsensusStatePB;
@@ -71,6 +76,7 @@ using consensus::GetConsensusStateRequestPB;
 using consensus::GetConsensusStateResponsePB;
 using consensus::GetLastOpIdRequestPB;
 using consensus::GetLastOpIdResponsePB;
+using consensus::MODIFY_PEER;
 using consensus::OpId;
 using consensus::RaftPeerPB;
 using rpc::RpcController;
@@ -385,32 +391,119 @@ Status MoveReplica(const RunnerContext &context) {
   const string& from_ts_uuid = FindOrDie(context.required_args, kFromTsUuidArg);
   const string& to_ts_uuid = FindOrDie(context.required_args, kToTsUuidArg);
 
-  // Check the tablet is in perfect health and, if so, add the new server.
+  // Check the tablet is in perfect health first.
   RETURN_NOT_OK_PREPEND(DoKsckForTablet(master_addresses, tablet_id),
                         "ksck pre-move health check failed");
-  RETURN_NOT_OK(DoChangeConfig(master_addresses, tablet_id, to_ts_uuid,
-                               RaftPeerPB::VOTER, consensus::ADD_PEER));
 
-  // Wait until the tablet copy completes and the tablet returns to perfect health.
-  MonoDelta copy_timeout = MonoDelta::FromSeconds(FLAGS_move_copy_timeout_sec);
-  RETURN_NOT_OK_PREPEND(WaitForCleanKsck(master_addresses, tablet_id, copy_timeout),
-                        "failed waiting for clean ksck after add server");
+  // The pre- KUDU-1097 way of moving a replica involves first adding a new
+  // replica and then evicting the old one.
+  if (!FLAGS_raft_prepare_replacement_before_eviction) {
+    RETURN_NOT_OK(DoChangeConfig(master_addresses, tablet_id, to_ts_uuid,
+                                RaftPeerPB::VOTER, consensus::ADD_PEER));
+
+    // Wait until the tablet copy completes and the tablet returns to perfect health.
+    MonoDelta copy_timeout = MonoDelta::FromSeconds(FLAGS_move_copy_timeout_sec);
+    RETURN_NOT_OK_PREPEND(WaitForCleanKsck(master_addresses, tablet_id, copy_timeout),
+                          "failed waiting for clean ksck after add server");
+
+    // Finally, remove the chosen replica.
+    // If it is the leader, it will be asked to step down.
+    client::sp::shared_ptr<KuduClient> client;
+    RETURN_NOT_OK(KuduClientBuilder().master_server_addrs(master_addresses).Build(&client));
+    string leader_uuid;
+    HostPort leader_hp;
+    RETURN_NOT_OK(GetTabletLeader(client, tablet_id, &leader_uuid, &leader_hp));
+    if (from_ts_uuid == leader_uuid) {
+      RETURN_NOT_OK_PREPEND(ChangeLeader(client, tablet_id,
+                                        leader_uuid, leader_hp,
+                                        MonoDelta::FromSeconds(FLAGS_move_leader_timeout_sec)),
+                            "failed changing leadership from the replica to be removed");
+    }
+    return DoChangeConfig(master_addresses, tablet_id, from_ts_uuid,
+                          boost::none, consensus::REMOVE_PEER);
+  }
+
+  // In a post- KUDU-1097 world, the procedure to move a replica is to add the
+  // replace=true attribute to the replica to remove while simultaneously
+  // adding the replacement as a non-voter with promote=true.
+  // The following code implements tablet movement in that paradigm.
 
-  // Finally, remove the chosen replica.
-  // If it is the leader, it will be asked to step down.
   client::sp::shared_ptr<KuduClient> client;
-  RETURN_NOT_OK(KuduClientBuilder().master_server_addrs(master_addresses).Build(&client));
+  RETURN_NOT_OK(KuduClientBuilder()
+                .master_server_addrs(master_addresses)
+                .Build(&client));
+
+  BulkChangeConfigRequestPB bulk_req;
+  {
+    auto* change = bulk_req.add_config_changes();
+    change->set_type(MODIFY_PEER);
+    *change->mutable_peer()->mutable_permanent_uuid() = from_ts_uuid;
+    change->mutable_peer()->mutable_attrs()->set_replace(true);
+  }
+  {
+    auto* change = bulk_req.add_config_changes();
+    change->set_type(ADD_PEER);
+    *change->mutable_peer()->mutable_permanent_uuid() = to_ts_uuid;
+    change->mutable_peer()->set_member_type(RaftPeerPB::NON_VOTER);
+    change->mutable_peer()->mutable_attrs()->set_promote(true);
+    HostPort hp;
+    RETURN_NOT_OK(GetRpcAddressForTS(client, to_ts_uuid, &hp));
+    RETURN_NOT_OK(HostPortToPB(hp, change->mutable_peer()->mutable_last_known_addr()));
+  }
+
+  // Find this tablet's leader replica. We need its UUID and RPC address.
   string leader_uuid;
   HostPort leader_hp;
   RETURN_NOT_OK(GetTabletLeader(client, tablet_id, &leader_uuid, &leader_hp));
-  if (from_ts_uuid == leader_uuid) {
-    RETURN_NOT_OK_PREPEND(ChangeLeader(client, tablet_id,
-                                       leader_uuid, leader_hp,
-                                       MonoDelta::FromSeconds(FLAGS_move_leader_timeout_sec)),
-                          "failed changing leadership from the replica to be removed");
+  unique_ptr<ConsensusServiceProxy> proxy;
+  RETURN_NOT_OK(BuildProxy(leader_hp.host(), leader_hp.port(), &proxy));
+
+  BulkChangeConfigRequestPB req;
+  consensus::ChangeConfigResponsePB resp;
+  RpcController rpc;
+  rpc.set_timeout(client->default_admin_operation_timeout());
+  bulk_req.set_dest_uuid(leader_uuid);
+  bulk_req.set_tablet_id(tablet_id);
+  RETURN_NOT_OK(proxy->BulkChangeConfig(bulk_req, &resp, &rpc));
+  if (resp.has_error()) {
+    return StatusFromPB(resp.error().status());
+  }
+
+  // Wait until the tablet copy completes and the tablet returns to perfect health.
+  MonoDelta copy_timeout = MonoDelta::FromSeconds(FLAGS_move_copy_timeout_sec);
+  MonoTime start = MonoTime::Now();
+  MonoTime deadline = start + copy_timeout;
+  while (MonoTime::Now() < deadline) {
+    Status s = DoKsckForTablet(master_addresses, tablet_id);
+    if (s.ok()) {
+      // Get the latest leader info.
+      RETURN_NOT_OK(GetTabletLeader(client, tablet_id, &leader_uuid, &leader_hp));
+      RETURN_NOT_OK(BuildProxy(leader_hp.host(), leader_hp.port(), &proxy));
+
+      // Wait until 'from_ts_uuid' is no longer a member of the config.
+      ConsensusStatePB cstate;
+      RETURN_NOT_OK(GetConsensusState(proxy, tablet_id, leader_uuid,
+                                      client->default_admin_operation_timeout(), &cstate));
+      bool from_ts_uuid_in_config = false; // Is 'from_ts_uuid' still in the config?
+      for (const auto& peer : cstate.committed_config().peers()) {
+        if (peer.permanent_uuid() == from_ts_uuid) {
+          if (peer.attrs().replace() && peer.permanent_uuid() == leader_uuid) {
+            // The leader is the node we intend to remove and is currently
+            // marked for replacement; Make it step down.
+            ignore_result(DoLeaderStepDown(client, tablet_id, leader_uuid, leader_hp));
+          }
+          from_ts_uuid_in_config = true;
+          break;
+        }
+      }
+      if (!from_ts_uuid_in_config) {
+        return Status::OK();
+      }
+    }
+    SleepFor(MonoDelta::FromMilliseconds(50));
   }
-  return DoChangeConfig(master_addresses, tablet_id, from_ts_uuid,
-                        boost::none, consensus::REMOVE_PEER);
+  return Status::TimedOut(Substitute("unable to complete tablet replica move after $0",
+                                     (MonoTime::Now() - start).ToString()));
 }
 
 } // anonymous namespace