You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by to...@apache.org on 2016/03/09 23:24:22 UTC

[4/4] incubator-kudu git commit: Fix flakiness in remote_bootstrap-itest

Fix flakiness in remote_bootstrap-itest

This test deleted a tablet, and then checked to see that the tablet data state
was TABLET_DATA_COPYING. However, it was possible for the remote bootstrap to
complete so quickly that it would already be TABLET_DATA_READY by the time
we checked, making the test flaky.

This changes the check to allow either COPYING or READY states.

Without the patch, it was around 8% flaky[1]. With the patch, it's not flaky[2].

[1] http://dist-test.cloudera.org/job?job_id=todd.1457557419.27817
[2] http://dist-test.cloudera.org/job?job_id=todd.1457559502.30323

Change-Id: I07b53a4209316b6a543f4d3d39ff17bed3890dd9
Reviewed-on: http://gerrit.cloudera.org:8080/2505
Tested-by: Kudu Jenkins
Reviewed-by: Adar Dembo <ad...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kudu/commit/54839984
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kudu/tree/54839984
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kudu/diff/54839984

Branch: refs/heads/master
Commit: 54839984932bca0c0ba49cdd8fa199a5711e589e
Parents: 6b0e5d7
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Mar 9 13:41:38 2016 -0800
Committer: Todd Lipcon <to...@apache.org>
Committed: Wed Mar 9 22:23:57 2016 +0000

----------------------------------------------------------------------
 src/kudu/integration-tests/delete_table-test.cc | 18 ++++----
 .../external_mini_cluster_fs_inspector.cc       | 43 +++++++++++++-------
 .../external_mini_cluster_fs_inspector.h        |  4 +-
 .../integration-tests/remote_bootstrap-itest.cc |  8 ++--
 .../tablet_replacement-itest.cc                 | 10 +++--
 src/kudu/tools/kudu-ts-cli-test.cc              |  2 +-
 6 files changed, 52 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/54839984/src/kudu/integration-tests/delete_table-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/delete_table-test.cc b/src/kudu/integration-tests/delete_table-test.cc
index d139caa..071b329 100644
--- a/src/kudu/integration-tests/delete_table-test.cc
+++ b/src/kudu/integration-tests/delete_table-test.cc
@@ -138,7 +138,7 @@ Status DeleteTableTest::CheckTabletTombstonedOrDeletedOnTS(
     return Status::IllegalState("Expected cmeta for tablet " + tablet_id + " but it doesn't exist");
   }
   if (is_superblock_expected == SUPERBLOCK_EXPECTED) {
-    RETURN_NOT_OK(inspect_->CheckTabletDataStateOnTS(index, tablet_id, data_state));
+    RETURN_NOT_OK(inspect_->CheckTabletDataStateOnTS(index, tablet_id, { data_state }));
   } else {
     TabletSuperBlockPB superblock_pb;
     Status s = inspect_->ReadTabletSuperBlockOnTS(index, tablet_id, &superblock_pb);
@@ -368,18 +368,18 @@ TEST_F(DeleteTableTest, TestAtomicDeleteTablet) {
   opid_index = -1;
   ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, opid_index, timeout,
                                 &error_code));
-  inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_TOMBSTONED);
+  inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, { TABLET_DATA_TOMBSTONED });
 
   // Now that the tablet is already tombstoned, our opid_index should be
   // ignored (because it's impossible to check it).
   ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, -9999, timeout,
                                 &error_code));
-  inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_TOMBSTONED);
+  inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, { TABLET_DATA_TOMBSTONED });
 
   // Same with TOMBSTONED -> DELETED.
   ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_DELETED, -9999, timeout,
                                 &error_code));
-  inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_DELETED);
+  inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, { TABLET_DATA_DELETED });
 }
 
 TEST_F(DeleteTableTest, TestDeleteTableWithConcurrentWrites) {
@@ -472,7 +472,7 @@ TEST_F(DeleteTableTest, TestAutoTombstoneAfterCrashDuringRemoteBootstrap) {
   NO_FATALS(WaitForTSToCrash(kTsIndex));
 
   // The superblock should be in TABLET_DATA_COPYING state on disk.
-  NO_FATALS(inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_COPYING));
+  NO_FATALS(inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, { TABLET_DATA_COPYING }));
 
   // Kill the other tablet servers so the leader doesn't try to remote
   // bootstrap it again during our verification here.
@@ -557,7 +557,7 @@ TEST_F(DeleteTableTest, TestAutoTombstoneAfterRemoteBootstrapRemoteFails) {
   // consensus metadata.
   ASSERT_OK(cluster_->tablet_server(1)->Restart());
   ASSERT_OK(cluster_->tablet_server(2)->Restart());
-  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_READY));
+  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, { TABLET_DATA_READY }));
   ClusterVerifier v(cluster_.get());
   NO_FATALS(v.CheckCluster());
   NO_FATALS(v.CheckRowCount(workload.table_name(), ClusterVerifier::AT_LEAST,
@@ -573,7 +573,7 @@ TEST_F(DeleteTableTest, TestAutoTombstoneAfterRemoteBootstrapRemoteFails) {
   // This time, the leader will have replaced a tablet with consensus metadata.
   ASSERT_OK(cluster_->tablet_server(1)->Resume());
   ASSERT_OK(cluster_->tablet_server(2)->Resume());
-  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_READY));
+  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, { TABLET_DATA_READY }));
 
   NO_FATALS(v.CheckCluster());
   NO_FATALS(v.CheckRowCount(workload.table_name(), ClusterVerifier::AT_LEAST,
@@ -658,7 +658,7 @@ TEST_F(DeleteTableTest, TestMergeConsensusMetadata) {
   LOG(INFO) << "Bringing TS " << cluster_->tablet_server(kTsIndex)->uuid()
             << " back up...";
   ASSERT_OK(cluster_->tablet_server(kTsIndex)->Restart());
-  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_READY));
+  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, { TABLET_DATA_READY }));
 
   // Assert that the election history is retained (voted for self).
   ASSERT_OK(inspect_->ReadConsensusMetadataOnTS(kTsIndex, tablet_id, &cmeta_pb));
@@ -681,7 +681,7 @@ TEST_F(DeleteTableTest, TestMergeConsensusMetadata) {
   NO_FATALS(WaitUntilTabletRunning(1, tablet_id));
   NO_FATALS(WaitUntilTabletRunning(2, tablet_id));
   ASSERT_OK(itest::StartElection(leader, tablet_id, timeout));
-  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_READY));
+  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, { TABLET_DATA_READY }));
 
   // The election history should have been wiped out.
   ASSERT_OK(inspect_->ReadConsensusMetadataOnTS(kTsIndex, tablet_id, &cmeta_pb));

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/54839984/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
index e1b5ca6..a350f55 100644
--- a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
+++ b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
@@ -181,16 +181,30 @@ Status ExternalMiniClusterFsInspector::ReadConsensusMetadataOnTS(int index,
   return pb_util::ReadPBContainerFromPath(env_, cmeta_file, cmeta_pb);
 }
 
-Status ExternalMiniClusterFsInspector::CheckTabletDataStateOnTS(int index,
-                                                                const string& tablet_id,
-                                                                TabletDataState state) {
+Status ExternalMiniClusterFsInspector::CheckTabletDataStateOnTS(
+    int index,
+    const string& tablet_id,
+    const vector<TabletDataState>& allowed_states) {
+
   TabletSuperBlockPB sb;
   RETURN_NOT_OK(ReadTabletSuperBlockOnTS(index, tablet_id, &sb));
-  if (PREDICT_FALSE(sb.tablet_data_state() != state)) {
-    return Status::IllegalState("Tablet data state != " + TabletDataState_Name(state),
-                                TabletDataState_Name(sb.tablet_data_state()));
+  if (std::find(allowed_states.begin(), allowed_states.end(), sb.tablet_data_state()) !=
+      allowed_states.end()) {
+    return Status::OK();
   }
-  return Status::OK();
+
+  vector<string> state_names;
+  for (auto state : allowed_states) {
+    state_names.push_back(TabletDataState_Name(state));
+  }
+  string expected_str = JoinStrings(state_names, ",");
+  if (state_names.size() > 1) {
+    expected_str = "one of: " + expected_str;
+  }
+
+  return Status::IllegalState(Substitute("State $0 unexpected, expected $1",
+                                         TabletDataState_Name(sb.tablet_data_state()),
+                                         expected_str));
 }
 
 Status ExternalMiniClusterFsInspector::WaitForNoData(const MonoDelta& timeout) {
@@ -262,23 +276,24 @@ Status ExternalMiniClusterFsInspector::WaitForReplicaCount(int expected, const M
                                      expected, found));
 }
 
-Status ExternalMiniClusterFsInspector::WaitForTabletDataStateOnTS(int index,
-                                                                  const string& tablet_id,
-                                                                  TabletDataState expected,
-                                                                  const MonoDelta& timeout) {
+Status ExternalMiniClusterFsInspector::WaitForTabletDataStateOnTS(
+    int index,
+    const string& tablet_id,
+    const vector<TabletDataState>& expected_states,
+    const MonoDelta& timeout) {
   MonoTime start = MonoTime::Now(MonoTime::FINE);
   MonoTime deadline = start;
   deadline.AddDelta(timeout);
   Status s;
   while (true) {
-    s = CheckTabletDataStateOnTS(index, tablet_id, expected);
+    s = CheckTabletDataStateOnTS(index, tablet_id, expected_states);
     if (s.ok()) return Status::OK();
     if (deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) break;
     SleepFor(MonoDelta::FromMilliseconds(5));
   }
-  return Status::TimedOut(Substitute("Timed out after $0 waiting for tablet data state $1: $2",
+  return Status::TimedOut(Substitute("Timed out after $0 waiting for correct tablet state: $1",
                                      MonoTime::Now(MonoTime::FINE).GetDeltaSince(start).ToString(),
-                                     TabletDataState_Name(expected), s.ToString()));
+                                     s.ToString()));
 }
 
 Status ExternalMiniClusterFsInspector::WaitForFilePatternInTabletWalDirOnTs(

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/54839984/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
index ff160da..c4f21ad 100644
--- a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
+++ b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
@@ -77,7 +77,7 @@ class ExternalMiniClusterFsInspector {
                                    consensus::ConsensusMetadataPB* cmeta_pb);
   Status CheckTabletDataStateOnTS(int index,
                                   const std::string& tablet_id,
-                                  tablet::TabletDataState state);
+                                  const std::vector<tablet::TabletDataState>& expected_states);
 
   Status WaitForNoData(const MonoDelta& timeout = MonoDelta::FromSeconds(30));
   Status WaitForNoDataOnTS(int index, const MonoDelta& timeout = MonoDelta::FromSeconds(30));
@@ -88,7 +88,7 @@ class ExternalMiniClusterFsInspector {
   Status WaitForReplicaCount(int expected, const MonoDelta& timeout = MonoDelta::FromSeconds(30));
   Status WaitForTabletDataStateOnTS(int index,
                                     const std::string& tablet_id,
-                                    tablet::TabletDataState data_state,
+                                    const std::vector<tablet::TabletDataState>& expected_states,
                                     const MonoDelta& timeout = MonoDelta::FromSeconds(30));
 
   // Loop and check for certain filenames in the WAL directory of the specified

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/54839984/src/kudu/integration-tests/remote_bootstrap-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/remote_bootstrap-itest.cc b/src/kudu/integration-tests/remote_bootstrap-itest.cc
index dcc1b28..e0c2b06 100644
--- a/src/kudu/integration-tests/remote_bootstrap-itest.cc
+++ b/src/kudu/integration-tests/remote_bootstrap-itest.cc
@@ -557,8 +557,10 @@ TEST_F(RemoteBootstrapITest, TestDeleteLeaderDuringRemoteBootstrapStressTest) {
                                   timeout));
 
     // Wait for remote bootstrap to start.
-    ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(follower_index, tablet_id,
-                                                   tablet::TABLET_DATA_COPYING, timeout));
+    ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(
+        follower_index, tablet_id,
+        { tablet::TABLET_DATA_COPYING, tablet::TABLET_DATA_READY },
+        timeout));
 
     // Tombstone the leader.
     LOG(INFO) << "Tombstoning leader tablet " << tablet_id << " on TS " << leader_ts->uuid();
@@ -727,7 +729,7 @@ TEST_F(RemoteBootstrapITest, TestSlowBootstrapDoesntFail) {
 
   // Wait for remote bootstrap to start.
   ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(1, tablet_id,
-                                                 tablet::TABLET_DATA_COPYING, timeout));
+                                                 { tablet::TABLET_DATA_COPYING }, timeout));
 
   workload.StopAndJoin();
   ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, 1));

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/54839984/src/kudu/integration-tests/tablet_replacement-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/tablet_replacement-itest.cc b/src/kudu/integration-tests/tablet_replacement-itest.cc
index 0a1d350..7418238 100644
--- a/src/kudu/integration-tests/tablet_replacement-itest.cc
+++ b/src/kudu/integration-tests/tablet_replacement-itest.cc
@@ -84,7 +84,8 @@ TEST_F(TabletReplacementITest, TestMasterTombstoneEvictedReplica) {
   ASSERT_OK(itest::RemoveServer(leader_ts, tablet_id, follower_ts, boost::none, timeout));
 
   // Wait for the Master to tombstone the replica.
-  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id, TABLET_DATA_TOMBSTONED,
+  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id,
+                                                 { TABLET_DATA_TOMBSTONED },
                                                  timeout));
 
   if (!AllowSlowTests()) {
@@ -105,14 +106,14 @@ TEST_F(TabletReplacementITest, TestMasterTombstoneEvictedReplica) {
   Status s = itest::AddServer(leader_ts, tablet_id, follower_ts, RaftPeerPB::VOTER,
                               boost::none, MonoDelta::FromSeconds(5));
   ASSERT_TRUE(s.IsTimedOut());
-  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id, TABLET_DATA_READY,
+  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id, { TABLET_DATA_READY },
                                                  timeout));
   ASSERT_OK(itest::WaitForServersToAgree(timeout, active_ts_map, tablet_id, 3));
 
   // Sleep for a few more seconds and check again to ensure that the Master
   // didn't end up tombstoning the replica.
   SleepFor(MonoDelta::FromSeconds(3));
-  ASSERT_OK(inspect_->CheckTabletDataStateOnTS(kFollowerIndex, tablet_id, TABLET_DATA_READY));
+  ASSERT_OK(inspect_->CheckTabletDataStateOnTS(kFollowerIndex, tablet_id, { TABLET_DATA_READY }));
 }
 
 // Ensure that the Master will tombstone a replica if it reports in with an old
@@ -166,7 +167,8 @@ TEST_F(TabletReplacementITest, TestMasterTombstoneOldReplicaOnReport) {
   ASSERT_OK(cluster_->tablet_server(kFollowerIndex)->Restart());
 
   // Wait for the Master to tombstone the revived follower.
-  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id, TABLET_DATA_TOMBSTONED,
+  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id,
+                                                 { TABLET_DATA_TOMBSTONED },
                                                  timeout));
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/54839984/src/kudu/tools/kudu-ts-cli-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/kudu-ts-cli-test.cc b/src/kudu/tools/kudu-ts-cli-test.cc
index 2a65de0..5317334 100644
--- a/src/kudu/tools/kudu-ts-cli-test.cc
+++ b/src/kudu/tools/kudu-ts-cli-test.cc
@@ -89,7 +89,7 @@ TEST_F(KuduTsCliTest, TestDeleteTablet) {
   argv.push_back("Deleting for kudu-ts-cli-test");
   ASSERT_OK(Subprocess::Call(argv));
 
-  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(0, tablet_id, tablet::TABLET_DATA_TOMBSTONED));
+  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(0, tablet_id, { tablet::TABLET_DATA_TOMBSTONED }));
   TServerDetails* ts = ts_map_[cluster_->tablet_server(0)->uuid()];
   ASSERT_OK(itest::WaitUntilTabletInState(ts, tablet_id, tablet::SHUTDOWN, timeout));
 }