You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2018/08/06 17:02:59 UTC
kudu git commit: [consensus] KUDU-2335 increment term on explicit
step down
Repository: kudu
Updated Branches:
refs/heads/master 1fcce4220 -> f6777db35
[consensus] KUDU-2335 increment term on explicit step down
Prior to this patch, the catalog manager could get a tablet report
from a former leader replica with empty leader UUID and old term.
A dedicated logic in the catalog manager's code (see section 7d(i))
would amend the empty leader UUID to replace it with the previous
leader's UUID. As a result of those shenanigans, catalog manager
would interpret the incoming report as a report from a leader replica
that reports its own health status as UNKNOWN.
The TwoConcurrentRebalancers scenario of the recently introduced
ConcurrentRebalancersTest reproduces the issue pretty often
(about 1 in 100 runs failed), so it was easy to pin-point the problem.
Mike sketched the fix and I ran the new code via dist-test about 1K
times and verified the problem is gone.
As for the test coverage, in addition to the already mentioned
would-be-flaky TwoConcurrentRebalancers scenario, I modified
RaftConsensusElectionITest.LeaderStepDown to reliably catch regressions.
Additionally, I updated the TabletCopyITest.TestRejectRogueLeader
scenario not asking rogue leader to step down: otherwise it would
fail.
Change-Id: I4e1f1446176a78ba04e74dd1153f9048a32d8d5f
Reviewed-on: http://gerrit.cloudera.org:8080/11122
Reviewed-by: Mike Percy <mp...@apache.org>
Tested-by: Alexey Serbin <as...@cloudera.com>
Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/f6777db3
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/f6777db3
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/f6777db3
Branch: refs/heads/master
Commit: f6777db35dfb3880ea188990809102d167c6d557
Parents: 1fcce42
Author: Alexey Serbin <as...@cloudera.com>
Authored: Fri Aug 3 15:30:22 2018 -0700
Committer: Alexey Serbin <as...@cloudera.com>
Committed: Mon Aug 6 05:58:08 2018 +0000
----------------------------------------------------------------------
src/kudu/consensus/raft_consensus.cc | 4 +-
.../raft_consensus_election-itest.cc | 41 ++++++++++++++------
src/kudu/integration-tests/tablet_copy-itest.cc | 13 ++-----
3 files changed, 35 insertions(+), 23 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kudu/blob/f6777db3/src/kudu/consensus/raft_consensus.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/raft_consensus.cc b/src/kudu/consensus/raft_consensus.cc
index f81637e..cd58896 100644
--- a/src/kudu/consensus/raft_consensus.cc
+++ b/src/kudu/consensus/raft_consensus.cc
@@ -552,8 +552,8 @@ Status RaftConsensus::StepDown(LeaderStepDownResponsePB* resp) {
return Status::OK();
}
LOG_WITH_PREFIX_UNLOCKED(INFO) << "Received request to step down";
- RETURN_NOT_OK(BecomeReplicaUnlocked());
-
+ RETURN_NOT_OK(HandleTermAdvanceUnlocked(CurrentTermUnlocked() + 1,
+ SKIP_FLUSH_TO_DISK));
// Snooze the failure detector for an extra leader failure timeout.
// This should ensure that a different replica is elected leader after this one steps down.
SnoozeFailureDetector(string("explicit stepdown request"), MonoDelta::FromMilliseconds(
http://git-wip-us.apache.org/repos/asf/kudu/blob/f6777db3/src/kudu/integration-tests/raft_consensus_election-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus_election-itest.cc b/src/kudu/integration-tests/raft_consensus_election-itest.cc
index b10294a..3debfb5 100644
--- a/src/kudu/integration-tests/raft_consensus_election-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus_election-itest.cc
@@ -26,6 +26,7 @@
#include <gtest/gtest.h>
#include "kudu/common/wire_protocol.pb.h"
+#include "kudu/consensus/consensus.pb.h"
#include "kudu/consensus/metadata.pb.h"
#include "kudu/gutil/gscoped_ptr.h"
#include "kudu/gutil/map-util.h"
@@ -59,8 +60,10 @@ METRIC_DECLARE_counter(transaction_memory_pressure_rejections);
METRIC_DECLARE_gauge_int64(raft_term);
using kudu::cluster::ExternalTabletServer;
+using kudu::consensus::ConsensusStatePB;
using kudu::consensus::RaftPeerPB;
using kudu::itest::AddServer;
+using kudu::itest::GetConsensusState;
using kudu::itest::GetReplicaStatusAndCheckIfLeader;
using kudu::itest::LeaderStepDown;
using kudu::itest::RemoveServer;
@@ -278,6 +281,7 @@ TEST_F(RaftConsensusElectionITest, AutomaticLeaderElectionOneReplica) {
}
TEST_F(RaftConsensusElectionITest, LeaderStepDown) {
+ const auto kTimeout = MonoDelta::FromSeconds(10);
const vector<string> kTsFlags = {
"--enable_leader_failure_detection=false"
};
@@ -293,25 +297,39 @@ TEST_F(RaftConsensusElectionITest, LeaderStepDown) {
AppendValuesFromMap(tablet_servers_, &tservers);
// Start with no leader.
- Status s = GetReplicaStatusAndCheckIfLeader(tservers[0], tablet_id_, MonoDelta::FromSeconds(10));
+ const auto* ts = tservers[0];
+ Status s = GetReplicaStatusAndCheckIfLeader(ts, tablet_id_, kTimeout);
ASSERT_TRUE(s.IsIllegalState()) << "TS #0 should not be leader yet: " << s.ToString();
// Become leader.
- ASSERT_OK(StartElection(tservers[0], tablet_id_, MonoDelta::FromSeconds(10)));
- ASSERT_OK(WaitUntilLeader(tservers[0], tablet_id_, MonoDelta::FromSeconds(10)));
- ASSERT_OK(WriteSimpleTestRow(tservers[0], tablet_id_, RowOperationsPB::INSERT,
- kTestRowKey, kTestRowIntVal, "foo", MonoDelta::FromSeconds(10)));
- ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), tablet_servers_, tablet_id_, 2));
+ ASSERT_OK(StartElection(ts, tablet_id_, kTimeout));
+ ASSERT_OK(WaitUntilLeader(ts, tablet_id_, kTimeout));
+ ASSERT_OK(WriteSimpleTestRow(ts, tablet_id_, RowOperationsPB::INSERT,
+ kTestRowKey, kTestRowIntVal, "foo", kTimeout));
+ ASSERT_OK(WaitForServersToAgree(kTimeout, tablet_servers_, tablet_id_, 2));
+
+ // Get the Raft term from the newly established leader.
+ ConsensusStatePB cstate_before;
+ ASSERT_OK(GetConsensusState(ts, tablet_id_, kTimeout,
+ consensus::EXCLUDE_HEALTH_REPORT, &cstate_before));
// Step down and test that a 2nd stepdown returns the expected result.
- ASSERT_OK(LeaderStepDown(tservers[0], tablet_id_, MonoDelta::FromSeconds(10)));
+ ASSERT_OK(LeaderStepDown(ts, tablet_id_, kTimeout));
+
+ // Get the Raft term from the leader that has just stepped down.
+ ConsensusStatePB cstate_after;
+ ASSERT_OK(GetConsensusState(ts, tablet_id_, kTimeout,
+ consensus::EXCLUDE_HEALTH_REPORT, &cstate_after));
+ // The stepped-down leader should increment its run-time Raft term.
+ EXPECT_GT(cstate_after.current_term(), cstate_before.current_term());
+
TabletServerErrorPB error;
- s = LeaderStepDown(tservers[0], tablet_id_, MonoDelta::FromSeconds(10), &error);
+ s = LeaderStepDown(ts, tablet_id_, kTimeout, &error);
ASSERT_TRUE(s.IsIllegalState()) << "TS #0 should not be leader anymore: " << s.ToString();
ASSERT_EQ(TabletServerErrorPB::NOT_THE_LEADER, error.code()) << SecureShortDebugString(error);
- s = WriteSimpleTestRow(tservers[0], tablet_id_, RowOperationsPB::INSERT,
- kTestRowKey, kTestRowIntVal, "foo", MonoDelta::FromSeconds(10));
+ s = WriteSimpleTestRow(ts, tablet_id_, RowOperationsPB::INSERT,
+ kTestRowKey, kTestRowIntVal, "foo", kTimeout);
ASSERT_TRUE(s.IsIllegalState()) << "TS #0 should not accept writes as follower: "
<< s.ToString();
}
@@ -517,8 +535,7 @@ TEST_F(RaftConsensusElectionITest, TombstonedVoteAfterFailedTabletCopy) {
"tablet_copy_fault_crash_on_fetch_all", "1.0"));
auto* leader_ts = tablet_servers_[leader_uuid];
auto* follower_ts = tablet_servers_[follower_uuid];
- ASSERT_OK(itest::AddServer(leader_ts, tablet_id_, follower_ts, RaftPeerPB::VOTER,
- kTimeout));
+ ASSERT_OK(AddServer(leader_ts, tablet_id_, follower_ts, RaftPeerPB::VOTER, kTimeout));
ASSERT_OK(cluster_->tablet_server_by_uuid(follower_uuid)->WaitForInjectedCrash(kTimeout));
// Shut down the rest of the cluster, then only bring back the node we
http://git-wip-us.apache.org/repos/asf/kudu/blob/f6777db3/src/kudu/integration-tests/tablet_copy-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/tablet_copy-itest.cc b/src/kudu/integration-tests/tablet_copy-itest.cc
index e8fc265..d4cf644 100644
--- a/src/kudu/integration-tests/tablet_copy-itest.cc
+++ b/src/kudu/integration-tests/tablet_copy-itest.cc
@@ -256,8 +256,7 @@ TEST_F(TabletCopyITest, TestRejectRogueLeader) {
ASSERT_OK(cluster_->tablet_server(zombie_leader_index)->Resume());
// Loop for a few seconds to ensure that the tablet doesn't transition to READY.
- MonoTime deadline = MonoTime::Now();
- deadline.AddDelta(MonoDelta::FromSeconds(5));
+ MonoTime deadline = MonoTime::Now() + MonoDelta::FromSeconds(5);
while (MonoTime::Now() < deadline) {
ASSERT_OK(itest::ListTablets(ts, timeout, &tablets));
ASSERT_EQ(1, tablets.size());
@@ -267,13 +266,9 @@ TEST_F(TabletCopyITest, TestRejectRogueLeader) {
LOG(INFO) << "the rogue leader was not able to tablet copy the tombstoned follower";
- // Force the rogue leader to step down.
- // Then, send a tablet copy start request from a "fake" leader that
- // sends an up-to-date term in the RB request but the actual term stored
- // in the copy source's consensus metadata would still be old.
- LOG(INFO) << "forcing rogue leader T " << tablet_id << " P " << zombie_leader_uuid
- << " to step down...";
- ASSERT_OK(itest::LeaderStepDown(ts_map_[zombie_leader_uuid], tablet_id, timeout));
+ // Send a tablet copy start request from a "fake" leader that
+ // sends an up-to-date term in the tablet copy request but the actual term
+ // stored in the copy source's consensus metadata would still be old.
ExternalTabletServer* zombie_ets = cluster_->tablet_server(zombie_leader_index);
// It's not necessarily part of the API but this could return faliure due to
// rejecting the remote. We intend to make that part async though, so ignoring