You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by aw...@apache.org on 2021/02/06 05:53:56 UTC

[kudu] 01/04: txn_commit-itest: deflake TestCommitTasksReloadOnLeadershipChange

This is an automated email from the ASF dual-hosted git repository.

awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 546e68cfd39e2a6f1b16bd1ddb580d1ebc97c9a4
Author: Andrew Wong <aw...@cloudera.com>
AuthorDate: Fri Feb 5 18:04:23 2021 -0800

    txn_commit-itest: deflake TestCommitTasksReloadOnLeadershipChange
    
    The test shows up on the flaky test dashboard as failing around 20% of
    the time. As it turns out, transferring leadership by quiescing multiple
    replicas can lead to flakiness if we happen to pick a lagging replica as
    the new leader.
    
    Instead of targeting a specific tablet server as the host of the new
    leaders, we'll now just quiesce the old leader tablet server and stop
    quiescing the other tablet servers.
    
    I ran the test 100 times in DEBUG mode. Before this patch, it failed 16
    of the 100 runs; with this patch, all 100 runs passed.
    
    Change-Id: I2b27864e72888367eb0af7de59e044a9e018c31b
    Reviewed-on: http://gerrit.cloudera.org:8080/17031
    Tested-by: Kudu Jenkins
    Reviewed-by: Hao Hao <ha...@cloudera.com>
---
 src/kudu/integration-tests/txn_commit-itest.cc | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/kudu/integration-tests/txn_commit-itest.cc b/src/kudu/integration-tests/txn_commit-itest.cc
index 084435e..23fb461 100644
--- a/src/kudu/integration-tests/txn_commit-itest.cc
+++ b/src/kudu/integration-tests/txn_commit-itest.cc
@@ -749,7 +749,6 @@ class ThreeNodeTxnCommitITest : public TxnCommitITest {
       *cluster_->mini_tablet_server(i)->server()->mutable_quiescing() = i != leader_idx;
     }
     leader_ts_ = cluster_->mini_tablet_server(leader_idx);
-    non_leader_ts_ = cluster_->mini_tablet_server(leader_idx + 1);
     // We should have two leaders for our table, and one for the
     // TxnStatusManager.
     ASSERT_EVENTUALLY([&] {
@@ -758,7 +757,6 @@ class ThreeNodeTxnCommitITest : public TxnCommitITest {
   }
  protected:
   MiniTabletServer* leader_ts_;
-  MiniTabletServer* non_leader_ts_;
 };
 
 TEST_F(ThreeNodeTxnCommitITest, TestCommitTasksReloadOnLeadershipChange) {
@@ -776,13 +774,16 @@ TEST_F(ThreeNodeTxnCommitITest, TestCommitTasksReloadOnLeadershipChange) {
   ASSERT_FALSE(is_complete);
 
   FLAGS_txn_schedule_background_tasks = true;
-  // Change our quiescing state and bring the previous leader down so a new
-  // leader can be elected.
-  auto* new_leader_ts = non_leader_ts_;
-  *new_leader_ts->server()->mutable_quiescing() = false;
+  // Change our quiescing states so a new leader can be elected.
   *leader_ts_->server()->mutable_quiescing() = true;
+  for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
+    auto* mts = cluster_->mini_tablet_server(i);
+    if (leader_ts_ != mts) {
+      *mts->server()->mutable_quiescing() = false;
+    }
+  }
   ASSERT_EVENTUALLY([&] {
-    ASSERT_EQ(3, new_leader_ts->server()->num_raft_leaders()->value());
+    ASSERT_EQ(0, leader_ts_->server()->num_raft_leaders()->value());
   });
   // Upon becoming leader, we should have started our commit task and completed
   // the commit.