You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2017/10/18 22:17:45 UTC

kudu git commit: [itests] fix flake in TombstonedVoteAfterFailedTabletCopy

Repository: kudu
Updated Branches:
  refs/heads/master 20118c7e4 -> bba425e8d


[itests] fix flake in TombstonedVoteAfterFailedTabletCopy

Fixed flakiness in the TombstonedVoteAfterFailedTabletCopy scenario
of the raft_consensus_election-itest.  A race between adding a new
replica and DeleteTablet was possible if adding a new server before
the catalog manager acknowledges eviction of the former replica and
sends DeleteTablet request.  In other words, prior to this fix,
a delayed DeleteTablet request might be processed _after_ a new
server has been added.

The race was easily reproducible by running the test scenario with
high number of stress-cpu-threads:

  ./bin/raft_consensus_election-itest \
    --stress_cpu_threads=128 \
    --gtest_filter=\
      'RaftConsensusElectionITest.TombstonedVoteAfterFailedTabletCopy'

Prior to this change, the test failed almost 100% if running as
described above. After the fix, no failures observed in about 100 runs.

Change-Id: Ifaba16d1acfee6ff01e475525482f21ff64133f4
Reviewed-on: http://gerrit.cloudera.org:8080/8298
Reviewed-by: Todd Lipcon <to...@apache.org>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/bba425e8
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/bba425e8
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/bba425e8

Branch: refs/heads/master
Commit: bba425e8de0f6fcbd78013960faa327fa0be7b85
Parents: 20118c7
Author: Alexey Serbin <as...@cloudera.com>
Authored: Tue Oct 17 08:21:52 2017 -0700
Committer: Alexey Serbin <as...@cloudera.com>
Committed: Wed Oct 18 22:16:22 2017 +0000

----------------------------------------------------------------------
 .../integration-tests/raft_consensus-itest.cc   |  3 ++-
 .../raft_consensus_election-itest.cc            | 21 ++++++++++++++------
 2 files changed, 17 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/bba425e8/src/kudu/integration-tests/raft_consensus-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus-itest.cc b/src/kudu/integration-tests/raft_consensus-itest.cc
index d012867..4c45849 100644
--- a/src/kudu/integration-tests/raft_consensus-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus-itest.cc
@@ -113,6 +113,7 @@ using kudu::itest::TServerDetails;
 using kudu::itest::TabletServerMap;
 using kudu::itest::WAIT_FOR_LEADER;
 using kudu::itest::WaitForReplicasReportedToMaster;
+using kudu::itest::WaitUntilCommittedOpIdIndexIs;
 using kudu::itest::WaitUntilLeader;
 using kudu::itest::WriteSimpleTestRow;
 using kudu::master::TabletLocationsPB;
@@ -1708,7 +1709,7 @@ TEST_F(RaftConsensusITest, TestMasterNotifiedOnConfigChange) {
   // Wait for initial NO_OP to be committed by the leader.
   TServerDetails* leader_ts;
   ASSERT_OK(FindTabletLeader(tablet_servers_, tablet_id, timeout, &leader_ts));
-  ASSERT_OK(itest::WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id, timeout));
+  ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id, timeout));
 
   // Change the config.
   TServerDetails* tserver_to_add = tablet_servers_[uuid_to_add];

http://git-wip-us.apache.org/repos/asf/kudu/blob/bba425e8/src/kudu/integration-tests/raft_consensus_election-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus_election-itest.cc b/src/kudu/integration-tests/raft_consensus_election-itest.cc
index ba4d538..23f5010 100644
--- a/src/kudu/integration-tests/raft_consensus_election-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus_election-itest.cc
@@ -72,6 +72,7 @@ using kudu::itest::WaitUntilLeader;
 using kudu::itest::WriteSimpleTestRow;
 using kudu::pb_util::SecureShortDebugString;
 using kudu::tablet::TABLET_DATA_COPYING;
+using kudu::tablet::TABLET_DATA_TOMBSTONED;
 using std::string;
 using std::unordered_set;
 using std::vector;
@@ -491,11 +492,19 @@ TEST_F(RaftConsensusElectionITest, TombstonedVoteAfterFailedTabletCopy) {
   ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(2, tablet_servers_[leader_uuid],
                                                 tablet_id_, kTimeout));
   ASSERT_EQ(1, active_tablet_servers.erase(follower_uuid));
-  ASSERT_OK(WaitForServersToAgree(kTimeout, active_tablet_servers, tablet_id_, 2));
 
-  // Now the follower is evicted and will be tombstoned.
-  // We'll add it back to the config and then crash the follower during the
-  // resulting tablet copy.
+  // Wait for the deleted tablet to be tombstoned, meaning the catalog manager
+  // has already sent DeleteTablet request and it has been processed. A race
+  // between adding a new server and processing DeleteTablet request is
+  // possible: the DeleteTablet request might be sent/processed _after_
+  // a new server has been added.
+  const int follower_idx =
+      cluster_->tablet_server_index_by_uuid(follower_uuid);
+  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(
+      follower_idx, tablet_id_, { TABLET_DATA_TOMBSTONED }, kTimeout));
+
+  // Add the evicted follower back to the config and then make it crash because
+  // of injected fault during tablet copy.
   ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server_by_uuid(follower_uuid),
             "tablet_copy_fault_crash_on_fetch_all", "1.0"));
   auto* leader_ts = tablet_servers_[leader_uuid];
@@ -509,8 +518,8 @@ TEST_F(RaftConsensusElectionITest, TombstonedVoteAfterFailedTabletCopy) {
   // to vote and the tablet should come online.
   cluster_->Shutdown();
 
-  int follower_idx = cluster_->tablet_server_index_by_uuid(follower_uuid);
-  ASSERT_OK(inspect_->CheckTabletDataStateOnTS(follower_idx, tablet_id_, { TABLET_DATA_COPYING }));
+  ASSERT_OK(inspect_->CheckTabletDataStateOnTS(follower_idx, tablet_id_,
+                                               { TABLET_DATA_COPYING }));
 
   ASSERT_OK(cluster_->master()->Restart());
   ASSERT_OK(cluster_->tablet_server_by_uuid(leader_uuid)->Restart());