You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2023/04/24 18:52:50 UTC

[kudu] branch master updated: [tests] fix flakiness in TestNoMoreRetryWithWongServerUuid

This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 62a12611c [tests] fix flakiness in TestNoMoreRetryWithWongServerUuid
62a12611c is described below

commit 62a12611c29f40a21972a23b9800667ec0bf9fc0
Author: Alexey Serbin <al...@apache.org>
AuthorDate: Fri Apr 21 20:16:01 2023 -0700

    [tests] fix flakiness in TestNoMoreRetryWithWongServerUuid
    
    The TestNoMoreRetryWithWongServerUuid scenario of the DeleteTabletITest
    was a bit flaky.  The system catalog sometimes sent DeleteTablet RPCs
    while the original tablet server was still starting up.  Even though
    those requests went through and were counted, the tablet server
    responded to them with an error status because its tablet manager
    wasn't ready to handle such requests yet, and the catalog manager
    retried its requests after receiving such responses.
    
    This patch addresses the flakiness and fixes a typo in the name of the
    scenario.  The scenario's dist-test stats for a DEBUG build are below.
    
    Before:
      6 out of 1024 failed:
        http://dist-test.cloudera.org/job?job_id=aserbin.1682129573.9602
    
    After:
      0 out of 1024 failed:
        http://dist-test.cloudera.org/job?job_id=aserbin.1682133036.42291
    
    Change-Id: I3d3dee89b32d1e33d1f0f41e8b83835b02eae336
    Reviewed-on: http://gerrit.cloudera.org:8080/19785
    Reviewed-by: Abhishek Chennaka <ac...@cloudera.com>
    Tested-by: Alexey Serbin <al...@apache.org>
---
 src/kudu/integration-tests/delete_tablet-itest.cc | 44 +++++++++++++++++------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/src/kudu/integration-tests/delete_tablet-itest.cc b/src/kudu/integration-tests/delete_tablet-itest.cc
index 9b58c4e4b..be8c4b9e7 100644
--- a/src/kudu/integration-tests/delete_tablet-itest.cc
+++ b/src/kudu/integration-tests/delete_tablet-itest.cc
@@ -238,14 +238,15 @@ TEST_F(DeleteTabletITest, TestNoOpDeleteTabletRPC) {
   ASSERT_EQ(0, flush_count_after - flush_count_before);
 }
 
-// Regression test for KUDU-3341: Ensure that master would not retry to send
+// Regression test for KUDU-3341: ensure that master would not retry sending
 // DeleteTablet() RPC to a "wrong server".
-TEST_F(DeleteTabletITest, TestNoMoreRetryWithWongServerUuid) {
+TEST_F(DeleteTabletITest, NoMoreRetryWithWrongServerUuid) {
   SKIP_IF_SLOW_NOT_ALLOWED();
+  constexpr int kNumTablets = 3;
+  constexpr int kNumTabletServers = 4;
+
   FLAGS_raft_heartbeat_interval_ms = 100;
   FLAGS_follower_unavailable_considered_failed_sec = 2;
-  const int kNumTablets = 3;
-  const int kNumTabletServers = 4;
 
   // Start a cluster and wait all tablets running and leader elected.
   NO_FATALS(StartCluster(kNumTabletServers));
@@ -253,8 +254,7 @@ TEST_F(DeleteTabletITest, TestNoMoreRetryWithWongServerUuid) {
   workload.set_num_tablets(kNumTablets);
   workload.Setup();
   ASSERT_EVENTUALLY([&] {
-    ClusterVerifier v(cluster_.get());
-    ASSERT_OK(v.RunKsck());
+    ASSERT_OK(ClusterVerifier(cluster_.get()).RunKsck());
   });
 
   // Get number of replicas on ts-0.
@@ -267,7 +267,7 @@ TEST_F(DeleteTabletITest, TestNoMoreRetryWithWongServerUuid) {
   }
 
   // Stop ts-0 and wait for replacement of replicas finished.
-  Sockaddr addr = ts->bound_rpc_addr();
+  const Sockaddr addr = ts->bound_rpc_addr();
   ts->Shutdown();
   SleepFor(MonoDelta::FromSeconds(2 * FLAGS_follower_unavailable_considered_failed_sec));
 
@@ -275,24 +275,46 @@ TEST_F(DeleteTabletITest, TestNoMoreRetryWithWongServerUuid) {
   ASSERT_OK(cluster_->AddTabletServer(HostPort(addr)));
 
   auto* new_ts = cluster_->mini_tablet_server(kNumTabletServers);
+
   int64_t num_delete_tablet_rpcs = 0;
   ASSERT_EVENTUALLY([&] {
     ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(new_ts->bound_http_addr()), &num_delete_tablet_rpcs));
     ASSERT_EQ(num_replicas, num_delete_tablet_rpcs);
   });
-  // Sleep enough time to verify no additional DeleteTablet RPCs are sent by master.
+  // Sleep for some time and verify that no additional DeleteTablet RPCs
+  // are sent by the system catalog.
   SleepFor(MonoDelta::FromSeconds(5));
+  ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(new_ts->bound_http_addr()), &num_delete_tablet_rpcs));
   ASSERT_EQ(num_replicas, num_delete_tablet_rpcs);
 
-  // Stop the new tablet server and restart ts-0, finally outdated tablets on ts-0 would be deleted.
+  // Stop the new tablet server and start ts-0 back. The outdated tablet
+  // replicas on ts-0 should be deleted.
   new_ts->Shutdown();
+
+  // Shutdown the system catalog before starting up ts-0. That's to avoid
+  // receiving DeleteTablet calls when tablet server isn't yet able to process
+  // them, but it still counts them in the metric that GetNumDeleteTabletRPCs()
+  // retrieves the stats from.
+  for (auto i = 0; i < cluster_->num_masters(); ++i) {
+    cluster_->mini_master(i)->Shutdown();
+  }
+
   ASSERT_OK(ts->Start());
+
+  for (auto i = 0; i < cluster_->num_masters(); ++i) {
+    ASSERT_OK(cluster_->mini_master(i)->Restart());
+  }
+
   ASSERT_EVENTUALLY([&] {
     ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(ts->bound_http_addr()), &num_delete_tablet_rpcs));
     ASSERT_EQ(num_replicas, num_delete_tablet_rpcs);
-    int num_live_tablets = ts->server()->tablet_manager()->GetNumLiveTablets();
-    ASSERT_EQ(0, num_live_tablets);
+    ASSERT_EQ(0, ts->server()->tablet_manager()->GetNumLiveTablets());
   });
+  // Sleep for some time and verify that no additional DeleteTablet RPCs
+  // are sent by the system catalog.
+  SleepFor(MonoDelta::FromSeconds(5));
+  ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(ts->bound_http_addr()), &num_delete_tablet_rpcs));
+  ASSERT_EQ(num_replicas, num_delete_tablet_rpcs);
 }
 
 } // namespace kudu