You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2023/04/24 18:52:50 UTC
[kudu] branch master updated: [tests] fix flakiness in TestNoMoreRetryWithWongServerUuid
This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 62a12611c [tests] fix flakiness in TestNoMoreRetryWithWongServerUuid
62a12611c is described below
commit 62a12611c29f40a21972a23b9800667ec0bf9fc0
Author: Alexey Serbin <al...@apache.org>
AuthorDate: Fri Apr 21 20:16:01 2023 -0700
[tests] fix flakiness in TestNoMoreRetryWithWongServerUuid
The TestNoMoreRetryWithWongServerUuid scenario of the DeleteTabletITest
was a bit flaky. The system catalog was sometimes sending
DeleteTablet RPCs while the original tablet server was still starting
up, so even if the DeleteTablet requests went through and were accounted
for, they were responded to with an error status because the tablet
manager of the original tablet server wasn't yet ready to process such
requests, and the catalog manager retried its requests after
receiving such error responses.
This patch addresses the flakiness and fixes a typo in the name of the
scenario. The scenario's dist-test stats for a DEBUG build are below.
Before:
6 out of 1024 failed:
http://dist-test.cloudera.org/job?job_id=aserbin.1682129573.9602
After:
0 out of 1024 failed:
http://dist-test.cloudera.org/job?job_id=aserbin.1682133036.42291
Change-Id: I3d3dee89b32d1e33d1f0f41e8b83835b02eae336
Reviewed-on: http://gerrit.cloudera.org:8080/19785
Reviewed-by: Abhishek Chennaka <ac...@cloudera.com>
Tested-by: Alexey Serbin <al...@apache.org>
---
src/kudu/integration-tests/delete_tablet-itest.cc | 44 +++++++++++++++++------
1 file changed, 33 insertions(+), 11 deletions(-)
diff --git a/src/kudu/integration-tests/delete_tablet-itest.cc b/src/kudu/integration-tests/delete_tablet-itest.cc
index 9b58c4e4b..be8c4b9e7 100644
--- a/src/kudu/integration-tests/delete_tablet-itest.cc
+++ b/src/kudu/integration-tests/delete_tablet-itest.cc
@@ -238,14 +238,15 @@ TEST_F(DeleteTabletITest, TestNoOpDeleteTabletRPC) {
ASSERT_EQ(0, flush_count_after - flush_count_before);
}
-// Regression test for KUDU-3341: Ensure that master would not retry to send
+// Regression test for KUDU-3341: ensure that master would not retry sending
// DeleteTablet() RPC to a "wrong server".
-TEST_F(DeleteTabletITest, TestNoMoreRetryWithWongServerUuid) {
+TEST_F(DeleteTabletITest, NoMoreRetryWithWrongServerUuid) {
SKIP_IF_SLOW_NOT_ALLOWED();
+ constexpr int kNumTablets = 3;
+ constexpr int kNumTabletServers = 4;
+
FLAGS_raft_heartbeat_interval_ms = 100;
FLAGS_follower_unavailable_considered_failed_sec = 2;
- const int kNumTablets = 3;
- const int kNumTabletServers = 4;
// Start a cluster and wait all tablets running and leader elected.
NO_FATALS(StartCluster(kNumTabletServers));
@@ -253,8 +254,7 @@ TEST_F(DeleteTabletITest, TestNoMoreRetryWithWongServerUuid) {
workload.set_num_tablets(kNumTablets);
workload.Setup();
ASSERT_EVENTUALLY([&] {
- ClusterVerifier v(cluster_.get());
- ASSERT_OK(v.RunKsck());
+ ASSERT_OK(ClusterVerifier(cluster_.get()).RunKsck());
});
// Get number of replicas on ts-0.
@@ -267,7 +267,7 @@ TEST_F(DeleteTabletITest, TestNoMoreRetryWithWongServerUuid) {
}
// Stop ts-0 and wait for replacement of replicas finished.
- Sockaddr addr = ts->bound_rpc_addr();
+ const Sockaddr addr = ts->bound_rpc_addr();
ts->Shutdown();
SleepFor(MonoDelta::FromSeconds(2 * FLAGS_follower_unavailable_considered_failed_sec));
@@ -275,24 +275,46 @@ TEST_F(DeleteTabletITest, TestNoMoreRetryWithWongServerUuid) {
ASSERT_OK(cluster_->AddTabletServer(HostPort(addr)));
auto* new_ts = cluster_->mini_tablet_server(kNumTabletServers);
+
int64_t num_delete_tablet_rpcs = 0;
ASSERT_EVENTUALLY([&] {
ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(new_ts->bound_http_addr()), &num_delete_tablet_rpcs));
ASSERT_EQ(num_replicas, num_delete_tablet_rpcs);
});
- // Sleep enough time to verify no additional DeleteTablet RPCs are sent by master.
+ // Sleep for some time and verify that no additional DeleteTablet RPCs
+ // are sent by the system catalog.
SleepFor(MonoDelta::FromSeconds(5));
+ ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(new_ts->bound_http_addr()), &num_delete_tablet_rpcs));
ASSERT_EQ(num_replicas, num_delete_tablet_rpcs);
- // Stop the new tablet server and restart ts-0, finally outdated tablets on ts-0 would be deleted.
+ // Stop the new tablet server and start ts-0 back. The outdated tablet
+ // replicas on ts-0 should be deleted.
new_ts->Shutdown();
+
+ // Shutdown the system catalog before starting up ts-0. That's to avoid
+ // receiving DeleteTablet calls when tablet server isn't yet able to process
+ // them, but it still counts them in the metric that GetNumDeleteTabletRPCs()
+ // retrieves the stats from.
+ for (auto i = 0; i < cluster_->num_masters(); ++i) {
+ cluster_->mini_master(i)->Shutdown();
+ }
+
ASSERT_OK(ts->Start());
+
+ for (auto i = 0; i < cluster_->num_masters(); ++i) {
+ ASSERT_OK(cluster_->mini_master(i)->Restart());
+ }
+
ASSERT_EVENTUALLY([&] {
ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(ts->bound_http_addr()), &num_delete_tablet_rpcs));
ASSERT_EQ(num_replicas, num_delete_tablet_rpcs);
- int num_live_tablets = ts->server()->tablet_manager()->GetNumLiveTablets();
- ASSERT_EQ(0, num_live_tablets);
+ ASSERT_EQ(0, ts->server()->tablet_manager()->GetNumLiveTablets());
});
+ // Sleep for some time and verify that no additional DeleteTablet RPCs
+ // are sent by the system catalog.
+ SleepFor(MonoDelta::FromSeconds(5));
+ ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(ts->bound_http_addr()), &num_delete_tablet_rpcs));
+ ASSERT_EQ(num_replicas, num_delete_tablet_rpcs);
}
} // namespace kudu