You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by ba...@apache.org on 2021/03/23 16:44:41 UTC
[kudu] branch master updated: [test] KUDU-3266 Fix flakiness in dynamic_multi_master test

This is an automated email from the ASF dual-hosted git repository.

bankim pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new ac09d62  [test] KUDU-3266 Fix flakiness in dynamic_multi_master test
ac09d62 is described below

commit ac09d6205486908df05af4d64dc7618aae7c37bf
Author: Bankim Bhavsar <ba...@cloudera.com>
AuthorDate: Fri Mar 19 16:16:22 2021 -0700

    [test] KUDU-3266 Fix flakiness in dynamic_multi_master test
    
    Flakiness was reported in the dynamic_multi_master test after
    the introduction of a test for recovering a dead master
    (commit 4b4a8c0f2f).
    
    See KUDU-3266 for the analysis.
    
    This change wraps the row count check in ASSERT_EVENTUALLY
    so that the resumed master and the remaining master are given
    a chance to exchange Raft messages and become up to date.
    
    Tests:
    - Reproduced the issue with an ASAN build using dist-test.
    - Verified no failures over 100 iterations with the fix
    on an ASAN build.
    
    Change-Id: Ifac1d95707064b6ac2624d3f52336d6c39afd3c8
    Reviewed-on: http://gerrit.cloudera.org:8080/17211
    Tested-by: Bankim Bhavsar <ba...@cloudera.com>
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
    Reviewed-by: Mahesh Reddy <mr...@cloudera.com>
    Reviewed-by: Alexey Serbin <as...@cloudera.com>
---
 src/kudu/master/dynamic_multi_master-test.cc | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/kudu/master/dynamic_multi_master-test.cc b/src/kudu/master/dynamic_multi_master-test.cc
index 911e10a..478c38e 100644
--- a/src/kudu/master/dynamic_multi_master-test.cc
+++ b/src/kudu/master/dynamic_multi_master-test.cc
@@ -598,9 +598,19 @@ class DynamicMultiMasterTest : public KuduTest {
       LOG(INFO) << "Pausing and resuming individual masters";
       string table_name = kTableName;
       for (int i = 0; i < expected_num_masters; i++) {
-        ASSERT_OK(migrated_cluster.master(i)->Pause());
-        cluster::ScopedResumeExternalDaemon resume_daemon(migrated_cluster.master(i));
-        NO_FATALS(cv.CheckRowCount(table_name, ClusterVerifier::EXACTLY, 0));
+        auto* master = migrated_cluster.master(i);
+        LOG(INFO) << Substitute("Pausing master $0, $1", master->uuid(),
+                                master->bound_rpc_hostport().ToString());
+        ASSERT_OK(master->Pause());
+        cluster::ScopedResumeExternalDaemon resume_daemon(master);
+
+        // We can run into a "table not found" error when the previously
+        // paused master, which was the leader of a prior term, resumes while
+        // the up-to-date follower has not become leader and the resumed
+        // master is itself not yet up to date. See KUDU-3266 for details.
+        ASSERT_EVENTUALLY([&] {
+          NO_FATALS(cv.CheckRowCount(table_name, ClusterVerifier::EXACTLY, 0));
+        });
 
         // See MasterFailoverTest.TestCreateTableSync to understand why we must
         // check for IsAlreadyPresent as well.