You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2018/08/06 17:02:59 UTC

kudu git commit: [consensus] KUDU-2335 increment term on explicit step down

Repository: kudu
Updated Branches:
  refs/heads/master 1fcce4220 -> f6777db35


[consensus] KUDU-2335 increment term on explicit step down

Prior to this patch, the catalog manager could get a tablet report
from a former leader replica with empty leader UUID and old term.
A dedicated logic in the catalog manager's code (see section 7d(i))
would amend the empty leader UUID to replace it with the previous
leader's UUID.  As a result of those shenanigans, catalog manager
would interpret the incoming report as a report from a leader replica
that reports its own health status as UNKNOWN.

The TwoConcurrentRebalancers scenario of the recently introduced
ConcurrentRebalancersTest reproduces the issue pretty often
(about 1 in 100 runs failed), so it was easy to pin-point the problem.
Mike sketched the fix and I ran the new code via dist-test about 1K
times and verified the problem is gone.

As for the test coverage, in addition to the already mentioned
would-be-flaky TwoConcurrentRebalancers scenario, I modified
RaftConsensusElectionITest.LeaderStepDown to reliably catch regressions.

Additionally, I updated the TabletCopyITest.TestRejectRogueLeader
scenario not asking rogue leader to step down: otherwise it would
fail.

Change-Id: I4e1f1446176a78ba04e74dd1153f9048a32d8d5f
Reviewed-on: http://gerrit.cloudera.org:8080/11122
Reviewed-by: Mike Percy <mp...@apache.org>
Tested-by: Alexey Serbin <as...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/f6777db3
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/f6777db3
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/f6777db3

Branch: refs/heads/master
Commit: f6777db35dfb3880ea188990809102d167c6d557
Parents: 1fcce42
Author: Alexey Serbin <as...@cloudera.com>
Authored: Fri Aug 3 15:30:22 2018 -0700
Committer: Alexey Serbin <as...@cloudera.com>
Committed: Mon Aug 6 05:58:08 2018 +0000

----------------------------------------------------------------------
 src/kudu/consensus/raft_consensus.cc            |  4 +-
 .../raft_consensus_election-itest.cc            | 41 ++++++++++++++------
 src/kudu/integration-tests/tablet_copy-itest.cc | 13 ++-----
 3 files changed, 35 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/f6777db3/src/kudu/consensus/raft_consensus.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/raft_consensus.cc b/src/kudu/consensus/raft_consensus.cc
index f81637e..cd58896 100644
--- a/src/kudu/consensus/raft_consensus.cc
+++ b/src/kudu/consensus/raft_consensus.cc
@@ -552,8 +552,8 @@ Status RaftConsensus::StepDown(LeaderStepDownResponsePB* resp) {
     return Status::OK();
   }
   LOG_WITH_PREFIX_UNLOCKED(INFO) << "Received request to step down";
-  RETURN_NOT_OK(BecomeReplicaUnlocked());
-
+  RETURN_NOT_OK(HandleTermAdvanceUnlocked(CurrentTermUnlocked() + 1,
+                                          SKIP_FLUSH_TO_DISK));
   // Snooze the failure detector for an extra leader failure timeout.
   // This should ensure that a different replica is elected leader after this one steps down.
   SnoozeFailureDetector(string("explicit stepdown request"), MonoDelta::FromMilliseconds(

http://git-wip-us.apache.org/repos/asf/kudu/blob/f6777db3/src/kudu/integration-tests/raft_consensus_election-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus_election-itest.cc b/src/kudu/integration-tests/raft_consensus_election-itest.cc
index b10294a..3debfb5 100644
--- a/src/kudu/integration-tests/raft_consensus_election-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus_election-itest.cc
@@ -26,6 +26,7 @@
 #include <gtest/gtest.h>
 
 #include "kudu/common/wire_protocol.pb.h"
+#include "kudu/consensus/consensus.pb.h"
 #include "kudu/consensus/metadata.pb.h"
 #include "kudu/gutil/gscoped_ptr.h"
 #include "kudu/gutil/map-util.h"
@@ -59,8 +60,10 @@ METRIC_DECLARE_counter(transaction_memory_pressure_rejections);
 METRIC_DECLARE_gauge_int64(raft_term);
 
 using kudu::cluster::ExternalTabletServer;
+using kudu::consensus::ConsensusStatePB;
 using kudu::consensus::RaftPeerPB;
 using kudu::itest::AddServer;
+using kudu::itest::GetConsensusState;
 using kudu::itest::GetReplicaStatusAndCheckIfLeader;
 using kudu::itest::LeaderStepDown;
 using kudu::itest::RemoveServer;
@@ -278,6 +281,7 @@ TEST_F(RaftConsensusElectionITest, AutomaticLeaderElectionOneReplica) {
 }
 
 TEST_F(RaftConsensusElectionITest, LeaderStepDown) {
+  const auto kTimeout = MonoDelta::FromSeconds(10);
   const vector<string> kTsFlags = {
     "--enable_leader_failure_detection=false"
   };
@@ -293,25 +297,39 @@ TEST_F(RaftConsensusElectionITest, LeaderStepDown) {
   AppendValuesFromMap(tablet_servers_, &tservers);
 
   // Start with no leader.
-  Status s = GetReplicaStatusAndCheckIfLeader(tservers[0], tablet_id_, MonoDelta::FromSeconds(10));
+  const auto* ts = tservers[0];
+  Status s = GetReplicaStatusAndCheckIfLeader(ts, tablet_id_, kTimeout);
   ASSERT_TRUE(s.IsIllegalState()) << "TS #0 should not be leader yet: " << s.ToString();
 
   // Become leader.
-  ASSERT_OK(StartElection(tservers[0], tablet_id_, MonoDelta::FromSeconds(10)));
-  ASSERT_OK(WaitUntilLeader(tservers[0], tablet_id_, MonoDelta::FromSeconds(10)));
-  ASSERT_OK(WriteSimpleTestRow(tservers[0], tablet_id_, RowOperationsPB::INSERT,
-                               kTestRowKey, kTestRowIntVal, "foo", MonoDelta::FromSeconds(10)));
-  ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), tablet_servers_, tablet_id_, 2));
+  ASSERT_OK(StartElection(ts, tablet_id_, kTimeout));
+  ASSERT_OK(WaitUntilLeader(ts, tablet_id_, kTimeout));
+  ASSERT_OK(WriteSimpleTestRow(ts, tablet_id_, RowOperationsPB::INSERT,
+                               kTestRowKey, kTestRowIntVal, "foo", kTimeout));
+  ASSERT_OK(WaitForServersToAgree(kTimeout, tablet_servers_, tablet_id_, 2));
+
+  // Get the Raft term from the newly established leader.
+  ConsensusStatePB cstate_before;
+  ASSERT_OK(GetConsensusState(ts, tablet_id_, kTimeout,
+                              consensus::EXCLUDE_HEALTH_REPORT, &cstate_before));
 
   // Step down and test that a 2nd stepdown returns the expected result.
-  ASSERT_OK(LeaderStepDown(tservers[0], tablet_id_, MonoDelta::FromSeconds(10)));
+  ASSERT_OK(LeaderStepDown(ts, tablet_id_, kTimeout));
+
+  // Get the Raft term from the leader that has just stepped down.
+  ConsensusStatePB cstate_after;
+  ASSERT_OK(GetConsensusState(ts, tablet_id_, kTimeout,
+                              consensus::EXCLUDE_HEALTH_REPORT, &cstate_after));
+  // The stepped-down leader should increment its run-time Raft term.
+  EXPECT_GT(cstate_after.current_term(), cstate_before.current_term());
+
   TabletServerErrorPB error;
-  s = LeaderStepDown(tservers[0], tablet_id_, MonoDelta::FromSeconds(10), &error);
+  s = LeaderStepDown(ts, tablet_id_, kTimeout, &error);
   ASSERT_TRUE(s.IsIllegalState()) << "TS #0 should not be leader anymore: " << s.ToString();
   ASSERT_EQ(TabletServerErrorPB::NOT_THE_LEADER, error.code()) << SecureShortDebugString(error);
 
-  s = WriteSimpleTestRow(tservers[0], tablet_id_, RowOperationsPB::INSERT,
-                         kTestRowKey, kTestRowIntVal, "foo", MonoDelta::FromSeconds(10));
+  s = WriteSimpleTestRow(ts, tablet_id_, RowOperationsPB::INSERT,
+                         kTestRowKey, kTestRowIntVal, "foo", kTimeout);
   ASSERT_TRUE(s.IsIllegalState()) << "TS #0 should not accept writes as follower: "
                                   << s.ToString();
 }
@@ -517,8 +535,7 @@ TEST_F(RaftConsensusElectionITest, TombstonedVoteAfterFailedTabletCopy) {
             "tablet_copy_fault_crash_on_fetch_all", "1.0"));
   auto* leader_ts = tablet_servers_[leader_uuid];
   auto* follower_ts = tablet_servers_[follower_uuid];
-  ASSERT_OK(itest::AddServer(leader_ts, tablet_id_, follower_ts, RaftPeerPB::VOTER,
-                             kTimeout));
+  ASSERT_OK(AddServer(leader_ts, tablet_id_, follower_ts, RaftPeerPB::VOTER, kTimeout));
   ASSERT_OK(cluster_->tablet_server_by_uuid(follower_uuid)->WaitForInjectedCrash(kTimeout));
 
   // Shut down the rest of the cluster, then only bring back the node we

http://git-wip-us.apache.org/repos/asf/kudu/blob/f6777db3/src/kudu/integration-tests/tablet_copy-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/tablet_copy-itest.cc b/src/kudu/integration-tests/tablet_copy-itest.cc
index e8fc265..d4cf644 100644
--- a/src/kudu/integration-tests/tablet_copy-itest.cc
+++ b/src/kudu/integration-tests/tablet_copy-itest.cc
@@ -256,8 +256,7 @@ TEST_F(TabletCopyITest, TestRejectRogueLeader) {
   ASSERT_OK(cluster_->tablet_server(zombie_leader_index)->Resume());
 
   // Loop for a few seconds to ensure that the tablet doesn't transition to READY.
-  MonoTime deadline = MonoTime::Now();
-  deadline.AddDelta(MonoDelta::FromSeconds(5));
+  MonoTime deadline = MonoTime::Now() + MonoDelta::FromSeconds(5);
   while (MonoTime::Now() < deadline) {
     ASSERT_OK(itest::ListTablets(ts, timeout, &tablets));
     ASSERT_EQ(1, tablets.size());
@@ -267,13 +266,9 @@ TEST_F(TabletCopyITest, TestRejectRogueLeader) {
 
   LOG(INFO) << "the rogue leader was not able to tablet copy the tombstoned follower";
 
-  // Force the rogue leader to step down.
-  // Then, send a tablet copy start request from a "fake" leader that
-  // sends an up-to-date term in the RB request but the actual term stored
-  // in the copy source's consensus metadata would still be old.
-  LOG(INFO) << "forcing rogue leader T " << tablet_id << " P " << zombie_leader_uuid
-            << " to step down...";
-  ASSERT_OK(itest::LeaderStepDown(ts_map_[zombie_leader_uuid], tablet_id, timeout));
+  // Send a tablet copy start request from a "fake" leader that
+  // sends an up-to-date term in the tablet copy request but the actual term
+  // stored in the copy source's consensus metadata would still be old.
   ExternalTabletServer* zombie_ets = cluster_->tablet_server(zombie_leader_index);
   // It's not necessarily part of the API but this could return faliure due to
   // rejecting the remote. We intend to make that part async though, so ignoring