Posted to commits@kudu.apache.org by mp...@apache.org on 2018/01/31 03:17:04 UTC

kudu git commit: [tests] de-flaking DontEvictIfRemainingConfigIsUnstable

Repository: kudu
Updated Branches:
  refs/heads/master 54081ce18 -> eef0e3942


[tests] de-flaking DontEvictIfRemainingConfigIsUnstable

Tuned the test cluster configuration to minimize the chance of failed
tablet replicas being reported 'one-by-one' instead of 'two-at-once' in
the DontEvictIfRemainingConfigIsUnstableITest-derived scenarios.  The
'one-by-one' reporting of failed replicas was the cause of flakiness in
the following scenarios:

  * DontEvictIfRemainingConfigIsUnstableITest.NodesDown/{0,1}
  * DontEvictIfRemainingConfigIsUnstableITest.NodesStopped/{0,1}

That was because the logic of those test scenarios assumes that 2 out of
3 replicas become unresponsive at once.  If only 1 out of 3 replicas
became unresponsive and was reported as such, the re-replication logic
(in both the 3-2-3 and 3-4-3 schemes) initiated replacement of the
failed replica.  That is perfectly legitimate, since at the time of the
single-replica failure report the majority of replicas was still
reported as online.
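
For reference, the core of the tuning is sketched below (condensed from
the diff further down; the remaining flags and the surrounding test
fixture are omitted).  The intent is to set the tablet-server-to-master
heartbeat interval to twice the follower failure timeout, so that by the
time the next heartbeat reaches the master, both unresponsive replicas
have already exceeded the failure timeout and are reported together:

  constexpr auto kUnavailableSec = 3;
  constexpr auto kTsToMasterHbIntervalSec = 2 * kUnavailableSec;
  const vector<string> ts_flags = {
    // A follower is considered failed after being unresponsive this long.
    Substitute("--follower_unavailable_considered_failed_sec=$0", kUnavailableSec),
    // Heartbeat to the master only after both failures have registered.
    Substitute("--heartbeat_interval_ms=$0", kTsToMasterHbIntervalSec * 1000),
    // ... other flags as in the diff below ...
  };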

As an additional fix, the invocation of GetTabletToReplicaUUIDsMapping()
is wrapped in ASSERT_EVENTUALLY() to guarantee that all relevant tablet
servers are included in the mapping being built.
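
The resulting pattern, as it appears in the diff below (comments are
taken from the patch itself), retries the whole check until every
replica has bootstrapped instead of asserting on a single snapshot of
the mapping:

  TabletToReplicaUUIDs tablet_to_replicas;
  ASSERT_EVENTUALLY([&] {
    ASSERT_OK(GetTabletToReplicaUUIDsMapping(kTimeout, &tablet_to_replicas));
    // There should be only one tablet.
    ASSERT_EQ(1, tablet_to_replicas.size());
    // It takes some time to bootstrap all replicas across all tablet servers.
    ASSERT_EQ(kNumReplicas, tablet_to_replicas.cbegin()->second.size());
  });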

Change-Id: Icb23d71b600a46f7fd6bba56091f884eae24784f
Reviewed-on: http://gerrit.cloudera.org:8080/9126
Tested-by: Kudu Jenkins
Reviewed-by: Mike Percy <mp...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/eef0e394
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/eef0e394
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/eef0e394

Branch: refs/heads/master
Commit: eef0e39424febe665432ac3e90db00a8da6b71b3
Parents: 54081ce
Author: Alexey Serbin <as...@cloudera.com>
Authored: Wed Jan 24 10:36:22 2018 -0800
Committer: Mike Percy <mp...@apache.org>
Committed: Wed Jan 31 03:12:22 2018 +0000

----------------------------------------------------------------------
 .../tablet_replacement-itest.cc                 | 41 ++++++++++++++------
 1 file changed, 29 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/eef0e394/src/kudu/integration-tests/tablet_replacement-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/tablet_replacement-itest.cc b/src/kudu/integration-tests/tablet_replacement-itest.cc
index 7390770..7ca9c3b 100644
--- a/src/kudu/integration-tests/tablet_replacement-itest.cc
+++ b/src/kudu/integration-tests/tablet_replacement-itest.cc
@@ -139,19 +139,29 @@ void TabletReplacementITest::TestDontEvictIfRemainingConfigIsUnstable(
     return;
   }
 
-  const MonoDelta kTimeout = MonoDelta::FromSeconds(10);
+  // The configuration is tuned to minimize chances of reporting on failed
+  // tablet replicas one-by-one. That's because by the scenario 2 replicas out
+  // of 3 are becoming unresponsive, and the scenario assumes the decision
+  // on whether to replace failed replicas is made knowing about both failed
+  // replicas.
+  const MonoDelta kTimeout = MonoDelta::FromSeconds(60);
   constexpr auto kUnavailableSec = 3;
+  constexpr auto kTsToMasterHbIntervalSec = 2 * kUnavailableSec;
   constexpr auto kConsensusRpcTimeoutSec = 2;
   constexpr auto kNumReplicas = 3;
   const vector<string> ts_flags = {
-    "--enable_leader_failure_detection=false",
+    Substitute("--raft_prepare_replacement_before_eviction=$0", is_3_4_3_mode),
+
     Substitute("--follower_unavailable_considered_failed_sec=$0", kUnavailableSec),
     Substitute("--consensus_rpc_timeout_ms=$0", kConsensusRpcTimeoutSec * 1000),
-    Substitute("--raft_prepare_replacement_before_eviction=$0", is_3_4_3_mode),
+    Substitute("--heartbeat_interval_ms=$0", kTsToMasterHbIntervalSec * 1000),
+    "--raft_heartbeat_interval_ms=50",
+    "--enable_leader_failure_detection=false",
   };
   const vector<string> master_flags = {
-    "--catalog_manager_wait_for_new_tablets_to_elect_leader=false",
     Substitute("--raft_prepare_replacement_before_eviction=$0", is_3_4_3_mode),
+
+    "--catalog_manager_wait_for_new_tablets_to_elect_leader=false",
   };
   // Additional tablet server is needed when running in 3-4-3 replica management
   // scheme to allow for eviction of failed tablet replicas.
@@ -163,12 +173,15 @@ void TabletReplacementITest::TestDontEvictIfRemainingConfigIsUnstable(
   workload.Setup(); // Easy way to create a new tablet.
 
   TabletToReplicaUUIDs tablet_to_replicas;
-  ASSERT_OK(GetTabletToReplicaUUIDsMapping(kTimeout, &tablet_to_replicas));
-  // There should be only one tablet.
-  ASSERT_EQ(1, tablet_to_replicas.size());
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(GetTabletToReplicaUUIDsMapping(kTimeout, &tablet_to_replicas));
+    // There should be only one tablet.
+    ASSERT_EQ(1, tablet_to_replicas.size());
+    // It takes some time to bootstrap all replicas across all tablet servers
+    ASSERT_EQ(kNumReplicas, tablet_to_replicas.cbegin()->second.size());
+  });
   const string tablet_id = tablet_to_replicas.cbegin()->first;
   const auto& replica_uuids = tablet_to_replicas.cbegin()->second;
-  ASSERT_EQ(kNumReplicas, replica_uuids.size());
 
   // Wait until all replicas are up and running.
   for (const auto& uuid : replica_uuids) {
@@ -215,6 +228,7 @@ void TabletReplacementITest::TestDontEvictIfRemainingConfigIsUnstable(
   // which was a bug that we had (KUDU-2230) and that this test also serves as
   // a regression test for.
   auto min_sleep_required_sec = std::max(kUnavailableSec, kConsensusRpcTimeoutSec);
+  min_sleep_required_sec = std::max(min_sleep_required_sec, kTsToMasterHbIntervalSec);
   SleepFor(MonoDelta::FromSeconds(2 * min_sleep_required_sec));
 
   {
@@ -495,12 +509,15 @@ TEST_P(EvictAndReplaceDeadFollowerITest, UnreachableFollower) {
   workload.Setup(); // Easy way to create a new tablet.
 
   TabletToReplicaUUIDs tablet_to_replicas;
-  ASSERT_OK(GetTabletToReplicaUUIDsMapping(kTimeout, &tablet_to_replicas));
-  // There should be only one tablet.
-  ASSERT_EQ(1, tablet_to_replicas.size());
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(GetTabletToReplicaUUIDsMapping(kTimeout, &tablet_to_replicas));
+    // There should be only one tablet.
+    ASSERT_EQ(1, tablet_to_replicas.size());
+    // It takes some time to bootstrap all replicas across all tablet servers
+    ASSERT_EQ(kNumReplicas, tablet_to_replicas.cbegin()->second.size());
+  });
   const string tablet_id = tablet_to_replicas.cbegin()->first;
   const auto& replica_uuids = tablet_to_replicas.cbegin()->second;
-  ASSERT_EQ(kNumReplicas, replica_uuids.size());
 
   // Wait until all replicas are up and running.
   for (const auto& uuid : replica_uuids) {