You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by to...@apache.org on 2016/03/09 23:24:22 UTC
[4/4] incubator-kudu git commit: Fix flakiness in
remote_bootstrap-itest
Fix flakiness in remote_bootstrap-itest
This test deleted a tablet, and then checked to see that the tablet data state
was TABLET_DATA_COPYING. However, it was possible for the remote bootstrap to
complete so quickly that it would already be TABLET_DATA_READY by the time
we checked, making the test flaky.
This changes the check to allow either COPYING or READY states.
Without the patch, it was around 8% flaky[1]. With the patch, it's not flaky[2].
[1] http://dist-test.cloudera.org/job?job_id=todd.1457557419.27817
[2] http://dist-test.cloudera.org/job?job_id=todd.1457559502.30323
Change-Id: I07b53a4209316b6a543f4d3d39ff17bed3890dd9
Reviewed-on: http://gerrit.cloudera.org:8080/2505
Tested-by: Kudu Jenkins
Reviewed-by: Adar Dembo <ad...@cloudera.com>
Project: http://git-wip-us.apache.org/repos/asf/incubator-kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kudu/commit/54839984
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kudu/tree/54839984
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kudu/diff/54839984
Branch: refs/heads/master
Commit: 54839984932bca0c0ba49cdd8fa199a5711e589e
Parents: 6b0e5d7
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Mar 9 13:41:38 2016 -0800
Committer: Todd Lipcon <to...@apache.org>
Committed: Wed Mar 9 22:23:57 2016 +0000
----------------------------------------------------------------------
src/kudu/integration-tests/delete_table-test.cc | 18 ++++----
.../external_mini_cluster_fs_inspector.cc | 43 +++++++++++++-------
.../external_mini_cluster_fs_inspector.h | 4 +-
.../integration-tests/remote_bootstrap-itest.cc | 8 ++--
.../tablet_replacement-itest.cc | 10 +++--
src/kudu/tools/kudu-ts-cli-test.cc | 2 +-
6 files changed, 52 insertions(+), 33 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/54839984/src/kudu/integration-tests/delete_table-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/delete_table-test.cc b/src/kudu/integration-tests/delete_table-test.cc
index d139caa..071b329 100644
--- a/src/kudu/integration-tests/delete_table-test.cc
+++ b/src/kudu/integration-tests/delete_table-test.cc
@@ -138,7 +138,7 @@ Status DeleteTableTest::CheckTabletTombstonedOrDeletedOnTS(
return Status::IllegalState("Expected cmeta for tablet " + tablet_id + " but it doesn't exist");
}
if (is_superblock_expected == SUPERBLOCK_EXPECTED) {
- RETURN_NOT_OK(inspect_->CheckTabletDataStateOnTS(index, tablet_id, data_state));
+ RETURN_NOT_OK(inspect_->CheckTabletDataStateOnTS(index, tablet_id, { data_state }));
} else {
TabletSuperBlockPB superblock_pb;
Status s = inspect_->ReadTabletSuperBlockOnTS(index, tablet_id, &superblock_pb);
@@ -368,18 +368,18 @@ TEST_F(DeleteTableTest, TestAtomicDeleteTablet) {
opid_index = -1;
ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, opid_index, timeout,
&error_code));
- inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_TOMBSTONED);
+ inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, { TABLET_DATA_TOMBSTONED });
// Now that the tablet is already tombstoned, our opid_index should be
// ignored (because it's impossible to check it).
ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, -9999, timeout,
&error_code));
- inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_TOMBSTONED);
+ inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, { TABLET_DATA_TOMBSTONED });
// Same with TOMBSTONED -> DELETED.
ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_DELETED, -9999, timeout,
&error_code));
- inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_DELETED);
+ inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, { TABLET_DATA_DELETED });
}
TEST_F(DeleteTableTest, TestDeleteTableWithConcurrentWrites) {
@@ -472,7 +472,7 @@ TEST_F(DeleteTableTest, TestAutoTombstoneAfterCrashDuringRemoteBootstrap) {
NO_FATALS(WaitForTSToCrash(kTsIndex));
// The superblock should be in TABLET_DATA_COPYING state on disk.
- NO_FATALS(inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_COPYING));
+ NO_FATALS(inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, { TABLET_DATA_COPYING }));
// Kill the other tablet servers so the leader doesn't try to remote
// bootstrap it again during our verification here.
@@ -557,7 +557,7 @@ TEST_F(DeleteTableTest, TestAutoTombstoneAfterRemoteBootstrapRemoteFails) {
// consensus metadata.
ASSERT_OK(cluster_->tablet_server(1)->Restart());
ASSERT_OK(cluster_->tablet_server(2)->Restart());
- ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_READY));
+ ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, { TABLET_DATA_READY }));
ClusterVerifier v(cluster_.get());
NO_FATALS(v.CheckCluster());
NO_FATALS(v.CheckRowCount(workload.table_name(), ClusterVerifier::AT_LEAST,
@@ -573,7 +573,7 @@ TEST_F(DeleteTableTest, TestAutoTombstoneAfterRemoteBootstrapRemoteFails) {
// This time, the leader will have replaced a tablet with consensus metadata.
ASSERT_OK(cluster_->tablet_server(1)->Resume());
ASSERT_OK(cluster_->tablet_server(2)->Resume());
- ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_READY));
+ ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, { TABLET_DATA_READY }));
NO_FATALS(v.CheckCluster());
NO_FATALS(v.CheckRowCount(workload.table_name(), ClusterVerifier::AT_LEAST,
@@ -658,7 +658,7 @@ TEST_F(DeleteTableTest, TestMergeConsensusMetadata) {
LOG(INFO) << "Bringing TS " << cluster_->tablet_server(kTsIndex)->uuid()
<< " back up...";
ASSERT_OK(cluster_->tablet_server(kTsIndex)->Restart());
- ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_READY));
+ ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, { TABLET_DATA_READY }));
// Assert that the election history is retained (voted for self).
ASSERT_OK(inspect_->ReadConsensusMetadataOnTS(kTsIndex, tablet_id, &cmeta_pb));
@@ -681,7 +681,7 @@ TEST_F(DeleteTableTest, TestMergeConsensusMetadata) {
NO_FATALS(WaitUntilTabletRunning(1, tablet_id));
NO_FATALS(WaitUntilTabletRunning(2, tablet_id));
ASSERT_OK(itest::StartElection(leader, tablet_id, timeout));
- ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_READY));
+ ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, { TABLET_DATA_READY }));
// The election history should have been wiped out.
ASSERT_OK(inspect_->ReadConsensusMetadataOnTS(kTsIndex, tablet_id, &cmeta_pb));
http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/54839984/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
index e1b5ca6..a350f55 100644
--- a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
+++ b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc
@@ -181,16 +181,30 @@ Status ExternalMiniClusterFsInspector::ReadConsensusMetadataOnTS(int index,
return pb_util::ReadPBContainerFromPath(env_, cmeta_file, cmeta_pb);
}
-Status ExternalMiniClusterFsInspector::CheckTabletDataStateOnTS(int index,
- const string& tablet_id,
- TabletDataState state) {
+Status ExternalMiniClusterFsInspector::CheckTabletDataStateOnTS(
+ int index,
+ const string& tablet_id,
+ const vector<TabletDataState>& allowed_states) {
+
TabletSuperBlockPB sb;
RETURN_NOT_OK(ReadTabletSuperBlockOnTS(index, tablet_id, &sb));
- if (PREDICT_FALSE(sb.tablet_data_state() != state)) {
- return Status::IllegalState("Tablet data state != " + TabletDataState_Name(state),
- TabletDataState_Name(sb.tablet_data_state()));
+ if (std::find(allowed_states.begin(), allowed_states.end(), sb.tablet_data_state()) !=
+ allowed_states.end()) {
+ return Status::OK();
}
- return Status::OK();
+
+ vector<string> state_names;
+ for (auto state : allowed_states) {
+ state_names.push_back(TabletDataState_Name(state));
+ }
+ string expected_str = JoinStrings(state_names, ",");
+ if (state_names.size() > 1) {
+ expected_str = "one of: " + expected_str;
+ }
+
+ return Status::IllegalState(Substitute("State $0 unexpected, expected $1",
+ TabletDataState_Name(sb.tablet_data_state()),
+ expected_str));
}
Status ExternalMiniClusterFsInspector::WaitForNoData(const MonoDelta& timeout) {
@@ -262,23 +276,24 @@ Status ExternalMiniClusterFsInspector::WaitForReplicaCount(int expected, const M
expected, found));
}
-Status ExternalMiniClusterFsInspector::WaitForTabletDataStateOnTS(int index,
- const string& tablet_id,
- TabletDataState expected,
- const MonoDelta& timeout) {
+Status ExternalMiniClusterFsInspector::WaitForTabletDataStateOnTS(
+ int index,
+ const string& tablet_id,
+ const vector<TabletDataState>& expected_states,
+ const MonoDelta& timeout) {
MonoTime start = MonoTime::Now(MonoTime::FINE);
MonoTime deadline = start;
deadline.AddDelta(timeout);
Status s;
while (true) {
- s = CheckTabletDataStateOnTS(index, tablet_id, expected);
+ s = CheckTabletDataStateOnTS(index, tablet_id, expected_states);
if (s.ok()) return Status::OK();
if (deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) break;
SleepFor(MonoDelta::FromMilliseconds(5));
}
- return Status::TimedOut(Substitute("Timed out after $0 waiting for tablet data state $1: $2",
+ return Status::TimedOut(Substitute("Timed out after $0 waiting for correct tablet state: $1",
MonoTime::Now(MonoTime::FINE).GetDeltaSince(start).ToString(),
- TabletDataState_Name(expected), s.ToString()));
+ s.ToString()));
}
Status ExternalMiniClusterFsInspector::WaitForFilePatternInTabletWalDirOnTs(
http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/54839984/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
index ff160da..c4f21ad 100644
--- a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
+++ b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h
@@ -77,7 +77,7 @@ class ExternalMiniClusterFsInspector {
consensus::ConsensusMetadataPB* cmeta_pb);
Status CheckTabletDataStateOnTS(int index,
const std::string& tablet_id,
- tablet::TabletDataState state);
+ const std::vector<tablet::TabletDataState>& expected_states);
Status WaitForNoData(const MonoDelta& timeout = MonoDelta::FromSeconds(30));
Status WaitForNoDataOnTS(int index, const MonoDelta& timeout = MonoDelta::FromSeconds(30));
@@ -88,7 +88,7 @@ class ExternalMiniClusterFsInspector {
Status WaitForReplicaCount(int expected, const MonoDelta& timeout = MonoDelta::FromSeconds(30));
Status WaitForTabletDataStateOnTS(int index,
const std::string& tablet_id,
- tablet::TabletDataState data_state,
+ const std::vector<tablet::TabletDataState>& expected_states,
const MonoDelta& timeout = MonoDelta::FromSeconds(30));
// Loop and check for certain filenames in the WAL directory of the specified
http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/54839984/src/kudu/integration-tests/remote_bootstrap-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/remote_bootstrap-itest.cc b/src/kudu/integration-tests/remote_bootstrap-itest.cc
index dcc1b28..e0c2b06 100644
--- a/src/kudu/integration-tests/remote_bootstrap-itest.cc
+++ b/src/kudu/integration-tests/remote_bootstrap-itest.cc
@@ -557,8 +557,10 @@ TEST_F(RemoteBootstrapITest, TestDeleteLeaderDuringRemoteBootstrapStressTest) {
timeout));
// Wait for remote bootstrap to start.
- ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(follower_index, tablet_id,
- tablet::TABLET_DATA_COPYING, timeout));
+ ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(
+ follower_index, tablet_id,
+ { tablet::TABLET_DATA_COPYING, tablet::TABLET_DATA_READY },
+ timeout));
// Tombstone the leader.
LOG(INFO) << "Tombstoning leader tablet " << tablet_id << " on TS " << leader_ts->uuid();
@@ -727,7 +729,7 @@ TEST_F(RemoteBootstrapITest, TestSlowBootstrapDoesntFail) {
// Wait for remote bootstrap to start.
ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(1, tablet_id,
- tablet::TABLET_DATA_COPYING, timeout));
+ { tablet::TABLET_DATA_COPYING }, timeout));
workload.StopAndJoin();
ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, 1));
http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/54839984/src/kudu/integration-tests/tablet_replacement-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/tablet_replacement-itest.cc b/src/kudu/integration-tests/tablet_replacement-itest.cc
index 0a1d350..7418238 100644
--- a/src/kudu/integration-tests/tablet_replacement-itest.cc
+++ b/src/kudu/integration-tests/tablet_replacement-itest.cc
@@ -84,7 +84,8 @@ TEST_F(TabletReplacementITest, TestMasterTombstoneEvictedReplica) {
ASSERT_OK(itest::RemoveServer(leader_ts, tablet_id, follower_ts, boost::none, timeout));
// Wait for the Master to tombstone the replica.
- ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id, TABLET_DATA_TOMBSTONED,
+ ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id,
+ { TABLET_DATA_TOMBSTONED },
timeout));
if (!AllowSlowTests()) {
@@ -105,14 +106,14 @@ TEST_F(TabletReplacementITest, TestMasterTombstoneEvictedReplica) {
Status s = itest::AddServer(leader_ts, tablet_id, follower_ts, RaftPeerPB::VOTER,
boost::none, MonoDelta::FromSeconds(5));
ASSERT_TRUE(s.IsTimedOut());
- ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id, TABLET_DATA_READY,
+ ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id, { TABLET_DATA_READY },
timeout));
ASSERT_OK(itest::WaitForServersToAgree(timeout, active_ts_map, tablet_id, 3));
// Sleep for a few more seconds and check again to ensure that the Master
// didn't end up tombstoning the replica.
SleepFor(MonoDelta::FromSeconds(3));
- ASSERT_OK(inspect_->CheckTabletDataStateOnTS(kFollowerIndex, tablet_id, TABLET_DATA_READY));
+ ASSERT_OK(inspect_->CheckTabletDataStateOnTS(kFollowerIndex, tablet_id, { TABLET_DATA_READY }));
}
// Ensure that the Master will tombstone a replica if it reports in with an old
@@ -166,7 +167,8 @@ TEST_F(TabletReplacementITest, TestMasterTombstoneOldReplicaOnReport) {
ASSERT_OK(cluster_->tablet_server(kFollowerIndex)->Restart());
// Wait for the Master to tombstone the revived follower.
- ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id, TABLET_DATA_TOMBSTONED,
+ ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id,
+ { TABLET_DATA_TOMBSTONED },
timeout));
}
http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/54839984/src/kudu/tools/kudu-ts-cli-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/kudu-ts-cli-test.cc b/src/kudu/tools/kudu-ts-cli-test.cc
index 2a65de0..5317334 100644
--- a/src/kudu/tools/kudu-ts-cli-test.cc
+++ b/src/kudu/tools/kudu-ts-cli-test.cc
@@ -89,7 +89,7 @@ TEST_F(KuduTsCliTest, TestDeleteTablet) {
argv.push_back("Deleting for kudu-ts-cli-test");
ASSERT_OK(Subprocess::Call(argv));
- ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(0, tablet_id, tablet::TABLET_DATA_TOMBSTONED));
+ ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(0, tablet_id, { tablet::TABLET_DATA_TOMBSTONED }));
TServerDetails* ts = ts_map_[cluster_->tablet_server(0)->uuid()];
ASSERT_OK(itest::WaitUntilTabletInState(ts, tablet_id, tablet::SHUTDOWN, timeout));
}