You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2017/10/18 22:17:45 UTC
kudu git commit: [itests] fix flake in
TombstonedVoteAfterFailedTabletCopy
Repository: kudu
Updated Branches:
refs/heads/master 20118c7e4 -> bba425e8d
[itests] fix flake in TombstonedVoteAfterFailedTabletCopy
Fixed flakiness in the TombstonedVoteAfterFailedTabletCopy scenario
of the raft_consensus_election-itest. A race between adding a new
replica and DeleteTablet was possible if adding a new server before
the catalog manager acknowledges eviction of the former replica and
sends DeleteTablet request. In other words, prior to this fix,
a delayed DeleteTablet request might be processed _after_ a new
server has been added.
The race was easily reproducible by running the test scenario with
high number of stress-cpu-threads:
./bin/raft_consensus_election-itest \
--stress_cpu_threads=128 \
--gtest_filter=\
'RaftConsensusElectionITest.TombstonedVoteAfterFailedTabletCopy'
Prior to this change, the test failed almost 100% if running as
described above. After the fix, no failures observed in about 100 runs.
Change-Id: Ifaba16d1acfee6ff01e475525482f21ff64133f4
Reviewed-on: http://gerrit.cloudera.org:8080/8298
Reviewed-by: Todd Lipcon <to...@apache.org>
Tested-by: Kudu Jenkins
Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/bba425e8
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/bba425e8
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/bba425e8
Branch: refs/heads/master
Commit: bba425e8de0f6fcbd78013960faa327fa0be7b85
Parents: 20118c7
Author: Alexey Serbin <as...@cloudera.com>
Authored: Tue Oct 17 08:21:52 2017 -0700
Committer: Alexey Serbin <as...@cloudera.com>
Committed: Wed Oct 18 22:16:22 2017 +0000
----------------------------------------------------------------------
.../integration-tests/raft_consensus-itest.cc | 3 ++-
.../raft_consensus_election-itest.cc | 21 ++++++++++++++------
2 files changed, 17 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kudu/blob/bba425e8/src/kudu/integration-tests/raft_consensus-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus-itest.cc b/src/kudu/integration-tests/raft_consensus-itest.cc
index d012867..4c45849 100644
--- a/src/kudu/integration-tests/raft_consensus-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus-itest.cc
@@ -113,6 +113,7 @@ using kudu::itest::TServerDetails;
using kudu::itest::TabletServerMap;
using kudu::itest::WAIT_FOR_LEADER;
using kudu::itest::WaitForReplicasReportedToMaster;
+using kudu::itest::WaitUntilCommittedOpIdIndexIs;
using kudu::itest::WaitUntilLeader;
using kudu::itest::WriteSimpleTestRow;
using kudu::master::TabletLocationsPB;
@@ -1708,7 +1709,7 @@ TEST_F(RaftConsensusITest, TestMasterNotifiedOnConfigChange) {
// Wait for initial NO_OP to be committed by the leader.
TServerDetails* leader_ts;
ASSERT_OK(FindTabletLeader(tablet_servers_, tablet_id, timeout, &leader_ts));
- ASSERT_OK(itest::WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id, timeout));
+ ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id, timeout));
// Change the config.
TServerDetails* tserver_to_add = tablet_servers_[uuid_to_add];
http://git-wip-us.apache.org/repos/asf/kudu/blob/bba425e8/src/kudu/integration-tests/raft_consensus_election-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus_election-itest.cc b/src/kudu/integration-tests/raft_consensus_election-itest.cc
index ba4d538..23f5010 100644
--- a/src/kudu/integration-tests/raft_consensus_election-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus_election-itest.cc
@@ -72,6 +72,7 @@ using kudu::itest::WaitUntilLeader;
using kudu::itest::WriteSimpleTestRow;
using kudu::pb_util::SecureShortDebugString;
using kudu::tablet::TABLET_DATA_COPYING;
+using kudu::tablet::TABLET_DATA_TOMBSTONED;
using std::string;
using std::unordered_set;
using std::vector;
@@ -491,11 +492,19 @@ TEST_F(RaftConsensusElectionITest, TombstonedVoteAfterFailedTabletCopy) {
ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(2, tablet_servers_[leader_uuid],
tablet_id_, kTimeout));
ASSERT_EQ(1, active_tablet_servers.erase(follower_uuid));
- ASSERT_OK(WaitForServersToAgree(kTimeout, active_tablet_servers, tablet_id_, 2));
- // Now the follower is evicted and will be tombstoned.
- // We'll add it back to the config and then crash the follower during the
- // resulting tablet copy.
+ // Wait for the deleted tablet to be tombstoned, meaning the catalog manager
+ // has already sent DeleteTablet request and it has been processed. A race
+ // between adding a new server and processing DeleteTablet request is
+ // possible: the DeleteTablet request might be sent/processed _after_
+ // a new server has been added.
+ const int follower_idx =
+ cluster_->tablet_server_index_by_uuid(follower_uuid);
+ ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(
+ follower_idx, tablet_id_, { TABLET_DATA_TOMBSTONED }, kTimeout));
+
+ // Add the evicted follower back to the config and then make it crash because
+ // of injected fault during tablet copy.
ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server_by_uuid(follower_uuid),
"tablet_copy_fault_crash_on_fetch_all", "1.0"));
auto* leader_ts = tablet_servers_[leader_uuid];
@@ -509,8 +518,8 @@ TEST_F(RaftConsensusElectionITest, TombstonedVoteAfterFailedTabletCopy) {
// to vote and the tablet should come online.
cluster_->Shutdown();
- int follower_idx = cluster_->tablet_server_index_by_uuid(follower_uuid);
- ASSERT_OK(inspect_->CheckTabletDataStateOnTS(follower_idx, tablet_id_, { TABLET_DATA_COPYING }));
+ ASSERT_OK(inspect_->CheckTabletDataStateOnTS(follower_idx, tablet_id_,
+ { TABLET_DATA_COPYING }));
ASSERT_OK(cluster_->master()->Restart());
ASSERT_OK(cluster_->tablet_server_by_uuid(leader_uuid)->Restart());