You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2017/12/01 22:49:45 UTC
kudu git commit: KUDU-2230: the leader is always a viable voter
Repository: kudu
Updated Branches:
refs/heads/master 3f4253753 -> dc497fec2
KUDU-2230: the leader is always a viable voter
While iterating over peers in PeerMessageQueue::SafeToEvictUnlocked(),
assume that the leader is always a viable voter. Prior to this patch,
the leader itself was not counted as a viable voter since the leader
does not update its own last_communication_time.
Both test scenarios TestDontEvictIfRemainingConfigIsUnstable_NodesDown
and TestDontEvictIfRemainingConfigIsUnstable_NodesStopped of the
TabletReplacementITest test have been modified to cover the fixed issue.
I verified that commenting out the fix in consensus_queue.cc
makes them fail.
Change-Id: Id200ec8e562e2b21c8eef09e7b38a8d85d23239c
Reviewed-on: http://gerrit.cloudera.org:8080/8709
Reviewed-by: Mike Percy <mp...@apache.org>
Reviewed-by: Andrew Wong <aw...@cloudera.com>
Tested-by: Alexey Serbin <as...@cloudera.com>
Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/dc497fec
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/dc497fec
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/dc497fec
Branch: refs/heads/master
Commit: dc497fec2cb1cabec17eb250ca7d8a7e3d6a9e8f
Parents: 3f42537
Author: Alexey Serbin <as...@cloudera.com>
Authored: Thu Nov 30 19:44:22 2017 -0800
Committer: Alexey Serbin <as...@cloudera.com>
Committed: Fri Dec 1 22:47:21 2017 +0000
----------------------------------------------------------------------
src/kudu/consensus/consensus_queue.cc | 37 +++++----
.../tablet_replacement-itest.cc | 82 +++++++++++++++-----
2 files changed, 84 insertions(+), 35 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kudu/blob/dc497fec/src/kudu/consensus/consensus_queue.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_queue.cc b/src/kudu/consensus/consensus_queue.cc
index 008b488..6d09b33 100644
--- a/src/kudu/consensus/consensus_queue.cc
+++ b/src/kudu/consensus/consensus_queue.cc
@@ -401,6 +401,7 @@ OpId PeerMessageQueue::GetNextOpId() const {
bool PeerMessageQueue::SafeToEvictUnlocked(const string& evict_uuid) const {
DCHECK(queue_lock_.is_locked());
+ DCHECK_EQ(LEADER, queue_state_.mode);
auto now = MonoTime::Now();
int remaining_voters = 0;
@@ -418,24 +419,26 @@ bool PeerMessageQueue::SafeToEvictUnlocked(const string& evict_uuid) const {
remaining_voters++;
bool viable = true;
+ // Being alive, the local peer itself (the leader) is always a viable
+ // voter: the criteria below apply only to non-local peers.
+ if (uuid != local_peer_pb_.permanent_uuid()) {
+ // Only consider a peer to be a viable voter if...
+ // ...its last exchange was successful
+ viable &= peer->last_exchange_status == PeerStatus::OK;
+
+ // ...the peer is up to date with the latest majority.
+ //
+ // This indicates that it's actively participating in majorities and likely to
+ // replicate a config change immediately when we propose it.
+ viable &= peer->last_received.index() >= queue_state_.majority_replicated_index;
- // Only consider a peer to be a viable voter if...
- // ...its last exchange was successful
- viable &= peer->last_exchange_status == PeerStatus::OK;
-
- // ...the peer is up to date with the latest majority.
- //
- // This indicates that it's actively participating in majorities and likely to
- // replicate a config change immediately when we propose it.
- viable &= peer->last_received.index() >= queue_state_.majority_replicated_index;
-
- // ...we have communicated successfully with it recently.
- //
- // This handles the case where the tablet has had no recent writes and therefore
- // even a replica that is down would have participated in the latest majority.
- auto unreachable_time = now - peer->last_communication_time;
- viable &= unreachable_time.ToMilliseconds() < FLAGS_consensus_rpc_timeout_ms;
-
+ // ...we have communicated successfully with it recently.
+ //
+ // This handles the case where the tablet has had no recent writes and therefore
+ // even a replica that is down would have participated in the latest majority.
+ auto unreachable_time = now - peer->last_communication_time;
+ viable &= unreachable_time.ToMilliseconds() < FLAGS_consensus_rpc_timeout_ms;
+ }
if (viable) {
remaining_viable_voters++;
}
http://git-wip-us.apache.org/repos/asf/kudu/blob/dc497fec/src/kudu/integration-tests/tablet_replacement-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/tablet_replacement-itest.cc b/src/kudu/integration-tests/tablet_replacement-itest.cc
index eb724f4..c0665b4 100644
--- a/src/kudu/integration-tests/tablet_replacement-itest.cc
+++ b/src/kudu/integration-tests/tablet_replacement-itest.cc
@@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.
+#include <algorithm>
#include <memory>
#include <ostream>
#include <string>
@@ -31,7 +32,6 @@
#include "kudu/common/wire_protocol.h"
#include "kudu/common/wire_protocol.pb.h"
#include "kudu/consensus/metadata.pb.h"
-#include "kudu/gutil/gscoped_ptr.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/integration-tests/cluster_itest_util.h"
#include "kudu/integration-tests/cluster_verifier.h"
@@ -326,14 +326,21 @@ TEST_F(TabletReplacementITest, TestEvictAndReplaceDeadFollower) {
void TabletReplacementITest::TestDontEvictIfRemainingConfigIsUnstable(InstabilityType type) {
if (!AllowSlowTests()) {
- LOG(INFO) << "Skipping test in fast-test mode.";
+ LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run";
return;
}
- MonoDelta timeout = MonoDelta::FromSeconds(30);
- vector<string> ts_flags = { "--enable_leader_failure_detection=false",
- "--follower_unavailable_considered_failed_sec=5" };
- vector<string> master_flags = { "--catalog_manager_wait_for_new_tablets_to_elect_leader=false" };
+ const MonoDelta kTimeout = MonoDelta::FromSeconds(10);
+ const int kUnavailableSec = 3;
+ const int kConsensusRpcTimeoutSec = 2;
+ const vector<string> ts_flags = {
+ "--enable_leader_failure_detection=false",
+ Substitute("--follower_unavailable_considered_failed_sec=$0", kUnavailableSec),
+ Substitute("--consensus_rpc_timeout_ms=$0", kConsensusRpcTimeoutSec * 1000)
+ };
+ const vector<string> master_flags = {
+ "--catalog_manager_wait_for_new_tablets_to_elect_leader=false",
+ };
NO_FATALS(StartCluster(ts_flags, master_flags));
TestWorkload workload(cluster_.get());
@@ -346,18 +353,21 @@ void TabletReplacementITest::TestDontEvictIfRemainingConfigIsUnstable(Instabilit
// Figure out the tablet id of the created tablet.
vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
- ASSERT_OK(itest::WaitForNumTabletsOnTS(leader_ts, 1, timeout, &tablets));
+ ASSERT_OK(itest::WaitForNumTabletsOnTS(leader_ts, 1, kTimeout, &tablets));
string tablet_id = tablets[0].tablet_status().tablet_id();
// Wait until all replicas are up and running.
for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()],
- tablet_id, timeout));
+ tablet_id, kTimeout));
}
// Elect a leader (TS 0)
- ASSERT_OK(itest::StartElection(leader_ts, tablet_id, timeout));
- ASSERT_OK(itest::WaitForServersToAgree(timeout, ts_map_, tablet_id, 1)); // Wait for NO_OP.
+ ASSERT_OK(itest::StartElection(leader_ts, tablet_id, kTimeout));
+ ASSERT_OK(itest::WaitForServersToAgree(kTimeout, ts_map_, tablet_id, 1)); // Wait for NO_OP.
+
+ consensus::ConsensusStatePB cstate_initial;
+ ASSERT_OK(GetConsensusState(leader_ts, tablet_id, kTimeout, &cstate_initial));
// Shut down both followers and wait for enough time that the leader thinks they are
// unresponsive. It should not trigger a config change to evict either one.
@@ -372,16 +382,52 @@ void TabletReplacementITest::TestDontEvictIfRemainingConfigIsUnstable(Instabilit
break;
}
- SleepFor(MonoDelta::FromSeconds(10));
- consensus::ConsensusStatePB cstate;
- ASSERT_OK(GetConsensusState(leader_ts, tablet_id, MonoDelta::FromSeconds(10), &cstate));
- SCOPED_TRACE(cstate.DebugString());
- ASSERT_FALSE(cstate.has_pending_config())
- << "Leader should not have issued any config change";
+ // Sleep to make sure the leader replica recognized the stopped/shutdown
+ // followers as unresponsive according to
+ // --follower_unavailable_considered_failed_sec. Since unreachable peers
+ // are not considered viable per PeerMessageQueue::SafeToEvictUnlocked(),
+ // which makes that calculation based on --consensus_rpc_timeout_ms, we also
+ // wait until that timeout expires to proceed. This ensures that later, when
+ // we resume a follower, the leader does not consider itself unreachable,
+ // which was a bug that we had (KUDU-2230) and that this test also serves as
+ // a regression test for.
+ auto min_sleep_required_sec = std::max(kUnavailableSec, kConsensusRpcTimeoutSec);
+ SleepFor(MonoDelta::FromSeconds(2 * min_sleep_required_sec));
+
+ {
+ consensus::ConsensusStatePB cstate;
+ ASSERT_OK(GetConsensusState(leader_ts, tablet_id, kTimeout, &cstate));
+ SCOPED_TRACE(cstate.DebugString());
+ ASSERT_FALSE(cstate.has_pending_config())
+ << "Leader should not have issued any config change";
+ ASSERT_EQ(cstate_initial.committed_config().opid_index(),
+ cstate.committed_config().opid_index())
+ << "Leader should not have issued any config change";
+ }
+
+ switch (type) {
+ case NODE_DOWN:
+ ASSERT_OK(cluster_->tablet_server(kFollower1Index)->Restart());
+ break;
+ case NODE_STOPPED:
+ ASSERT_OK(cluster_->tablet_server(kFollower1Index)->Resume());
+ break;
+ }
+
+ // At this point the majority of voters is back online, so the leader should
+ // evict the failed replica, resulting in Raft configuration update.
+ ASSERT_EVENTUALLY([&] {
+ consensus::ConsensusStatePB cstate;
+ ASSERT_OK(GetConsensusState(leader_ts, tablet_id, kTimeout, &cstate));
+ ASSERT_GT(cstate.committed_config().opid_index(),
+ cstate_initial.committed_config().opid_index())
+ << "Leader should have issued config change to evict failed follower;"
+ << " the consensus state is: " << cstate.DebugString();
+ });
}
-// Regression test for KUDU-2048. If a majority of followers are unresponsive, the
-// leader should not evict any of them.
+// Regression test for KUDU-2048 and KUDU-2230. If a majority of followers are
+// unresponsive, the leader should not evict any of them.
TEST_F(TabletReplacementITest, TestDontEvictIfRemainingConfigIsUnstable_NodesDown) {
TestDontEvictIfRemainingConfigIsUnstable(NODE_DOWN);
}