You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by mp...@apache.org on 2017/11/29 07:21:23 UTC
[3/3] kudu git commit: KUDU-1097 (patch 5b): kudu tablet
config_change move should use 3-4-3
KUDU-1097 (patch 5b): kudu tablet config_change move should use 3-4-3
This patch adds support for 3-4-3 re-replication to the "replica move"
tool.
Change-Id: I259aeaf24f60a05b88c5c3e0e11d603aa75646a5
Reviewed-on: http://gerrit.cloudera.org:8080/8645
Tested-by: Kudu Jenkins
Reviewed-by: Alexey Serbin <as...@cloudera.com>
Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/9120cdd1
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/9120cdd1
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/9120cdd1
Branch: refs/heads/master
Commit: 9120cdd1d288ef6b3e03e2cd8445436712a7d4a9
Parents: eb0718e
Author: Mike Percy <mp...@apache.org>
Authored: Mon Nov 27 02:46:24 2017 -0800
Committer: Mike Percy <mp...@apache.org>
Committed: Wed Nov 29 07:20:56 2017 +0000
----------------------------------------------------------------------
.../integration-tests/cluster_itest_util.cc | 22 +++-
src/kudu/integration-tests/cluster_itest_util.h | 11 +-
.../raft_config_change-itest.cc | 2 +-
src/kudu/tools/kudu-admin-test.cc | 52 +++++++-
src/kudu/tools/tool_action_tablet.cc | 127 ++++++++++++++++---
5 files changed, 185 insertions(+), 29 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kudu/blob/9120cdd1/src/kudu/integration-tests/cluster_itest_util.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/cluster_itest_util.cc b/src/kudu/integration-tests/cluster_itest_util.cc
index 757f6d0..089dba1 100644
--- a/src/kudu/integration-tests/cluster_itest_util.cc
+++ b/src/kudu/integration-tests/cluster_itest_util.cc
@@ -23,6 +23,7 @@
#include <boost/optional/optional.hpp>
#include <glog/logging.h>
#include <glog/stl_logging.h>
+#include <gtest/gtest.h>
#include <rapidjson/document.h>
#include "kudu/client/schema.h"
@@ -58,6 +59,8 @@
#include "kudu/util/net/sockaddr.h"
#include "kudu/util/pb_util.h"
#include "kudu/util/status.h"
+#include "kudu/util/test_macros.h"
+#include "kudu/util/test_util.h"
namespace kudu {
namespace itest {
@@ -366,7 +369,7 @@ Status WaitUntilNoPendingConfig(const TServerDetails* replica,
SecureShortDebugString(cstate_tmp), s.ToString()));
}
-Status WaitUntilCommittedConfigNumVotersIs(int config_size,
+Status WaitUntilCommittedConfigNumVotersIs(int num_voters,
const TServerDetails* replica,
const std::string& tablet_id,
const MonoDelta& timeout) {
@@ -381,7 +384,7 @@ Status WaitUntilCommittedConfigNumVotersIs(int config_size,
MonoDelta remaining_timeout = deadline - MonoTime::Now();
s = GetConsensusState(replica, tablet_id, remaining_timeout, &cstate);
if (s.ok()) {
- if (CountVoters(cstate.committed_config()) == config_size) {
+ if (CountVoters(cstate.committed_config()) == num_voters) {
return Status::OK();
}
}
@@ -394,10 +397,23 @@ Status WaitUntilCommittedConfigNumVotersIs(int config_size,
}
return Status::TimedOut(Substitute("Number of voters does not equal $0 after waiting for $1. "
"Last consensus state: $2. Last status: $3",
- config_size, timeout.ToString(),
+ num_voters, timeout.ToString(),
SecureShortDebugString(cstate), s.ToString()));
}
+void WaitUntilCommittedConfigNumMembersIs(int num_members,
+ const TServerDetails* replica,
+ const std::string& tablet_id,
+ const MonoDelta& timeout) {
+ MonoTime deadline = MonoTime::Now() + timeout;
+ AssertEventually([&] {
+ ConsensusStatePB cstate;
+ ASSERT_OK(GetConsensusState(replica, tablet_id, deadline - MonoTime::Now(), &cstate));
+ ASSERT_EQ(num_members, cstate.committed_config().peers_size());
+ }, timeout);
+ NO_PENDING_FATALS();
+}
+
Status WaitUntilCommittedConfigOpIdIndexIs(int64_t opid_index,
const TServerDetails* replica,
const std::string& tablet_id,
http://git-wip-us.apache.org/repos/asf/kudu/blob/9120cdd1/src/kudu/integration-tests/cluster_itest_util.h
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/cluster_itest_util.h b/src/kudu/integration-tests/cluster_itest_util.h
index 64260f1..215a90c 100644
--- a/src/kudu/integration-tests/cluster_itest_util.h
+++ b/src/kudu/integration-tests/cluster_itest_util.h
@@ -159,12 +159,19 @@ Status WaitUntilNoPendingConfig(const TServerDetails* replica,
consensus::ConsensusStatePB* cstate = nullptr);
// Wait until the number of voters in the committed consensus configuration is
-// 'quorum_size', according to the specified replica.
-Status WaitUntilCommittedConfigNumVotersIs(int config_size,
+// 'num_voters', according to the specified replica.
+Status WaitUntilCommittedConfigNumVotersIs(int num_voters,
const TServerDetails* replica,
const std::string& tablet_id,
const MonoDelta& timeout);
+// Wait until the number of voters in the committed consensus configuration is
+// 'num_members', according to the specified replica.
+void WaitUntilCommittedConfigNumMembersIs(int num_members,
+ const TServerDetails* replica,
+ const std::string& tablet_id,
+ const MonoDelta& timeout);
+
// Wait until the opid_index of the committed consensus config on the
// specified tablet is 'opid_index'.
Status WaitUntilCommittedConfigOpIdIndexIs(int64_t opid_index,
http://git-wip-us.apache.org/repos/asf/kudu/blob/9120cdd1/src/kudu/integration-tests/raft_config_change-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_config_change-itest.cc b/src/kudu/integration-tests/raft_config_change-itest.cc
index 8df15a5..4099727 100644
--- a/src/kudu/integration-tests/raft_config_change-itest.cc
+++ b/src/kudu/integration-tests/raft_config_change-itest.cc
@@ -219,7 +219,7 @@ TEST_F(RaftConfigChangeITest, TestNonVoterPromotion) {
});
// Wait for there to be 4 voters in the config.
- ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(/*config_size=*/ 4,
+ ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(/*num_voters=*/ 4,
leader_replica,
tablet_id,
kTimeout));
http://git-wip-us.apache.org/repos/asf/kudu/blob/9120cdd1/src/kudu/tools/kudu-admin-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/kudu-admin-test.cc b/src/kudu/tools/kudu-admin-test.cc
index 1506238..7581116 100644
--- a/src/kudu/tools/kudu-admin-test.cc
+++ b/src/kudu/tools/kudu-admin-test.cc
@@ -96,6 +96,13 @@ namespace kudu {
namespace tools {
class AdminCliTest : public tserver::TabletServerIntegrationTestBase {
+ protected:
+ enum EnableKudu1097 {
+ kDisableKudu1097,
+ kEnableKudu1097
+ };
+
+ void DoTestMoveTablet(EnableKudu1097 enable_kudu_1097);
};
// Test config change while running a workload.
@@ -211,10 +218,17 @@ TEST_F(AdminCliTest, TestChangeConfig) {
// 3. Start a workload.
// 4. Using the CLI, move the 3 replicas around the 5 TS.
// 5. Profit!
-TEST_F(AdminCliTest, TestMoveTablet) {
+void AdminCliTest::DoTestMoveTablet(EnableKudu1097 enable_kudu_1097) {
+ const string kKudu1097Flag = "--raft_prepare_replacement_before_eviction=true";
+
FLAGS_num_tablet_servers = 5;
FLAGS_num_replicas = 3;
- NO_FATALS(BuildAndStart());
+
+ vector<string> ts_flags, master_flags;
+ if (enable_kudu_1097) {
+ ts_flags = master_flags = { kKudu1097Flag };
+ }
+ NO_FATALS(BuildAndStart(ts_flags, master_flags));
vector<string> tservers;
AppendKeysFromMap(tablet_servers_, &tservers);
@@ -249,15 +263,28 @@ TEST_F(AdminCliTest, TestMoveTablet) {
for (int i = 0; i < num_moves; i++) {
const string remove = active_tservers.front();
const string add = inactive_tservers.front();
- ASSERT_OK(RunKuduTool({
+ vector<string> tool_command = {
"tablet",
"change_config",
"move_replica",
+ };
+ vector<string> kudu_1097_args = {
+ "--unlock_experimental_flags",
+ kKudu1097Flag,
+ };
+ vector<string> tool_args = {
cluster_->master()->bound_rpc_addr().ToString(),
tablet_id_,
remove,
- add
- }));
+ add,
+ };
+ if (enable_kudu_1097 == kEnableKudu1097) {
+ // Only add these arguments if we running with Kudu 1097 enabled.
+ tool_command.insert(tool_command.end(), kudu_1097_args.begin(), kudu_1097_args.end());
+ }
+ tool_command.insert(tool_command.end(), tool_args.begin(), tool_args.end());
+
+ ASSERT_OK(RunKuduTool(tool_command));
active_tservers.pop_front();
active_tservers.push_back(add);
inactive_tservers.pop_front();
@@ -269,8 +296,13 @@ TEST_F(AdminCliTest, TestMoveTablet) {
for (const string& uuid : active_tservers) {
InsertOrDie(&active_tservers_map, uuid, tablet_servers_[uuid]);
}
- ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(FLAGS_num_replicas, active_tservers_map[add],
+ ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(/*num_voters=*/ FLAGS_num_replicas,
+ active_tservers_map[add],
tablet_id_, MonoDelta::FromSeconds(30)));
+ NO_FATALS(WaitUntilCommittedConfigNumMembersIs(/*num_members=*/ FLAGS_num_replicas,
+ active_tservers_map[add],
+ tablet_id_, MonoDelta::FromSeconds(30)));
+
}
workload.StopAndJoin();
@@ -278,6 +310,14 @@ TEST_F(AdminCliTest, TestMoveTablet) {
NO_FATALS(v.CheckCluster());
}
+TEST_F(AdminCliTest, TestMoveTablet_pre_KUDU_1097) {
+ DoTestMoveTablet(kDisableKudu1097);
+}
+
+TEST_F(AdminCliTest, TestMoveTablet_KUDU_1097) {
+ DoTestMoveTablet(kEnableKudu1097);
+}
+
Status RunUnsafeChangeConfig(const string& tablet_id,
const string& dst_host,
vector<string> peer_uuid_list) {
http://git-wip-us.apache.org/repos/asf/kudu/blob/9120cdd1/src/kudu/tools/tool_action_tablet.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/tool_action_tablet.cc b/src/kudu/tools/tool_action_tablet.cc
index 1a27ecc..c6c9209 100644
--- a/src/kudu/tools/tool_action_tablet.cc
+++ b/src/kudu/tools/tool_action_tablet.cc
@@ -28,6 +28,7 @@
#include <boost/optional/optional.hpp>
#include <gflags/gflags.h>
+#include <gflags/gflags_declare.h>
#include <glog/logging.h>
#include "kudu/client/client.h"
@@ -38,6 +39,7 @@
#include "kudu/consensus/metadata.pb.h"
#include "kudu/consensus/opid.pb.h"
#include "kudu/consensus/opid_util.h"
+#include "kudu/gutil/basictypes.h"
#include "kudu/gutil/map-util.h"
#include "kudu/gutil/stl_util.h"
#include "kudu/gutil/strings/split.h"
@@ -56,6 +58,7 @@ DEFINE_int64(move_copy_timeout_sec, 600,
"Number of seconds to wait for tablet copy to complete when relocating a tablet");
DEFINE_int64(move_leader_timeout_sec, 30,
"Number of seconds to wait for a leader when relocating a leader tablet");
+DECLARE_bool(raft_prepare_replacement_before_eviction);
namespace kudu {
namespace tools {
@@ -64,6 +67,8 @@ using client::KuduClient;
using client::KuduClientBuilder;
using client::KuduTablet;
using client::KuduTabletServer;
+using consensus::ADD_PEER;
+using consensus::BulkChangeConfigRequestPB;
using consensus::ChangeConfigType;
using consensus::ConsensusServiceProxy;
using consensus::ConsensusStatePB;
@@ -71,6 +76,7 @@ using consensus::GetConsensusStateRequestPB;
using consensus::GetConsensusStateResponsePB;
using consensus::GetLastOpIdRequestPB;
using consensus::GetLastOpIdResponsePB;
+using consensus::MODIFY_PEER;
using consensus::OpId;
using consensus::RaftPeerPB;
using rpc::RpcController;
@@ -385,32 +391,119 @@ Status MoveReplica(const RunnerContext &context) {
const string& from_ts_uuid = FindOrDie(context.required_args, kFromTsUuidArg);
const string& to_ts_uuid = FindOrDie(context.required_args, kToTsUuidArg);
- // Check the tablet is in perfect health and, if so, add the new server.
+ // Check the tablet is in perfect health first.
RETURN_NOT_OK_PREPEND(DoKsckForTablet(master_addresses, tablet_id),
"ksck pre-move health check failed");
- RETURN_NOT_OK(DoChangeConfig(master_addresses, tablet_id, to_ts_uuid,
- RaftPeerPB::VOTER, consensus::ADD_PEER));
- // Wait until the tablet copy completes and the tablet returns to perfect health.
- MonoDelta copy_timeout = MonoDelta::FromSeconds(FLAGS_move_copy_timeout_sec);
- RETURN_NOT_OK_PREPEND(WaitForCleanKsck(master_addresses, tablet_id, copy_timeout),
- "failed waiting for clean ksck after add server");
+ // The pre- KUDU-1097 way of moving a replica involves first adding a new
+ // replica and then evicting the old one.
+ if (!FLAGS_raft_prepare_replacement_before_eviction) {
+ RETURN_NOT_OK(DoChangeConfig(master_addresses, tablet_id, to_ts_uuid,
+ RaftPeerPB::VOTER, consensus::ADD_PEER));
+
+ // Wait until the tablet copy completes and the tablet returns to perfect health.
+ MonoDelta copy_timeout = MonoDelta::FromSeconds(FLAGS_move_copy_timeout_sec);
+ RETURN_NOT_OK_PREPEND(WaitForCleanKsck(master_addresses, tablet_id, copy_timeout),
+ "failed waiting for clean ksck after add server");
+
+ // Finally, remove the chosen replica.
+ // If it is the leader, it will be asked to step down.
+ client::sp::shared_ptr<KuduClient> client;
+ RETURN_NOT_OK(KuduClientBuilder().master_server_addrs(master_addresses).Build(&client));
+ string leader_uuid;
+ HostPort leader_hp;
+ RETURN_NOT_OK(GetTabletLeader(client, tablet_id, &leader_uuid, &leader_hp));
+ if (from_ts_uuid == leader_uuid) {
+ RETURN_NOT_OK_PREPEND(ChangeLeader(client, tablet_id,
+ leader_uuid, leader_hp,
+ MonoDelta::FromSeconds(FLAGS_move_leader_timeout_sec)),
+ "failed changing leadership from the replica to be removed");
+ }
+ return DoChangeConfig(master_addresses, tablet_id, from_ts_uuid,
+ boost::none, consensus::REMOVE_PEER);
+ }
+
+ // In a post- KUDU-1097 world, the procedure to move a replica is to add the
+ // replace=true attribute to the replica to remove while simultaneously
+ // adding the replacement as a non-voter with promote=true.
+ // The following code implements tablet movement in that paradigm.
- // Finally, remove the chosen replica.
- // If it is the leader, it will be asked to step down.
client::sp::shared_ptr<KuduClient> client;
- RETURN_NOT_OK(KuduClientBuilder().master_server_addrs(master_addresses).Build(&client));
+ RETURN_NOT_OK(KuduClientBuilder()
+ .master_server_addrs(master_addresses)
+ .Build(&client));
+
+ BulkChangeConfigRequestPB bulk_req;
+ {
+ auto* change = bulk_req.add_config_changes();
+ change->set_type(MODIFY_PEER);
+ *change->mutable_peer()->mutable_permanent_uuid() = from_ts_uuid;
+ change->mutable_peer()->mutable_attrs()->set_replace(true);
+ }
+ {
+ auto* change = bulk_req.add_config_changes();
+ change->set_type(ADD_PEER);
+ *change->mutable_peer()->mutable_permanent_uuid() = to_ts_uuid;
+ change->mutable_peer()->set_member_type(RaftPeerPB::NON_VOTER);
+ change->mutable_peer()->mutable_attrs()->set_promote(true);
+ HostPort hp;
+ RETURN_NOT_OK(GetRpcAddressForTS(client, to_ts_uuid, &hp));
+ RETURN_NOT_OK(HostPortToPB(hp, change->mutable_peer()->mutable_last_known_addr()));
+ }
+
+ // Find this tablet's leader replica. We need its UUID and RPC address.
string leader_uuid;
HostPort leader_hp;
RETURN_NOT_OK(GetTabletLeader(client, tablet_id, &leader_uuid, &leader_hp));
- if (from_ts_uuid == leader_uuid) {
- RETURN_NOT_OK_PREPEND(ChangeLeader(client, tablet_id,
- leader_uuid, leader_hp,
- MonoDelta::FromSeconds(FLAGS_move_leader_timeout_sec)),
- "failed changing leadership from the replica to be removed");
+ unique_ptr<ConsensusServiceProxy> proxy;
+ RETURN_NOT_OK(BuildProxy(leader_hp.host(), leader_hp.port(), &proxy));
+
+ BulkChangeConfigRequestPB req;
+ consensus::ChangeConfigResponsePB resp;
+ RpcController rpc;
+ rpc.set_timeout(client->default_admin_operation_timeout());
+ bulk_req.set_dest_uuid(leader_uuid);
+ bulk_req.set_tablet_id(tablet_id);
+ RETURN_NOT_OK(proxy->BulkChangeConfig(bulk_req, &resp, &rpc));
+ if (resp.has_error()) {
+ return StatusFromPB(resp.error().status());
+ }
+
+ // Wait until the tablet copy completes and the tablet returns to perfect health.
+ MonoDelta copy_timeout = MonoDelta::FromSeconds(FLAGS_move_copy_timeout_sec);
+ MonoTime start = MonoTime::Now();
+ MonoTime deadline = start + copy_timeout;
+ while (MonoTime::Now() < deadline) {
+ Status s = DoKsckForTablet(master_addresses, tablet_id);
+ if (s.ok()) {
+ // Get the latest leader info.
+ RETURN_NOT_OK(GetTabletLeader(client, tablet_id, &leader_uuid, &leader_hp));
+ RETURN_NOT_OK(BuildProxy(leader_hp.host(), leader_hp.port(), &proxy));
+
+ // Wait until 'from_ts_uuid' is no longer a member of the config.
+ ConsensusStatePB cstate;
+ RETURN_NOT_OK(GetConsensusState(proxy, tablet_id, leader_uuid,
+ client->default_admin_operation_timeout(), &cstate));
+ bool from_ts_uuid_in_config = false; // Is 'from_ts_uuid' still in the config?
+ for (const auto& peer : cstate.committed_config().peers()) {
+ if (peer.permanent_uuid() == from_ts_uuid) {
+ if (peer.attrs().replace() && peer.permanent_uuid() == leader_uuid) {
+ // The leader is the node we intend to remove and is currently
+ // marked for replacement; Make it step down.
+ ignore_result(DoLeaderStepDown(client, tablet_id, leader_uuid, leader_hp));
+ }
+ from_ts_uuid_in_config = true;
+ break;
+ }
+ }
+ if (!from_ts_uuid_in_config) {
+ return Status::OK();
+ }
+ }
+ SleepFor(MonoDelta::FromMilliseconds(50));
}
- return DoChangeConfig(master_addresses, tablet_id, from_ts_uuid,
- boost::none, consensus::REMOVE_PEER);
+ return Status::TimedOut(Substitute("unable to complete tablet replica move after $0",
+ (MonoTime::Now() - start).ToString()));
}
} // anonymous namespace