You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by da...@apache.org on 2017/08/31 04:02:16 UTC
[2/3] kudu git commit: consensus: Save previous last-logged OpId
across tablet copies
consensus: Save previous last-logged OpId across tablet copies
When a replica falls behind the leader's log, and then fails a tablet
copy, it should still be able to vote in leader elections while
tombstoned. With this patch, that becomes possible. This helps with
availability.
Without this patch, asking a post-tablet copy failure replica to vote
would result in the following RPC response:
IllegalState: must be running to vote when last-logged opid is not known
Change-Id: Ie886764adbdc6d84406df44d6cdb693d038bcd57
Reviewed-on: http://gerrit.cloudera.org:8080/7888
Tested-by: Kudu Jenkins
Reviewed-by: Alexey Serbin <as...@cloudera.com>
Reviewed-by: David Ribeiro Alves <da...@gmail.com>
Reviewed-on: http://gerrit.cloudera.org:8080/7912
Reviewed-by: Dan Burkert <da...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/9d71f99a
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/9d71f99a
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/9d71f99a
Branch: refs/heads/branch-1.5.x
Commit: 9d71f99a9c54e20ff5ab76d91ad6e41b05fc5467
Parents: 0dbc67c
Author: Mike Percy <mp...@apache.org>
Authored: Tue Aug 29 14:20:24 2017 -0700
Committer: Dan Burkert <da...@apache.org>
Committed: Thu Aug 31 04:01:39 2017 +0000
----------------------------------------------------------------------
.../integration-tests/raft_consensus-itest.cc | 76 +++++++++++++++++---
src/kudu/tablet/tablet_metadata.cc | 4 +-
src/kudu/tserver/tablet_copy_client.cc | 9 +++
src/kudu/tserver/ts_tablet_manager.cc | 3 +-
src/kudu/tserver/ts_tablet_manager.h | 2 +-
5 files changed, 81 insertions(+), 13 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kudu/blob/9d71f99a/src/kudu/integration-tests/raft_consensus-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus-itest.cc b/src/kudu/integration-tests/raft_consensus-itest.cc
index 91fa7b4..8e4cf8a 100644
--- a/src/kudu/integration-tests/raft_consensus-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus-itest.cc
@@ -141,6 +141,7 @@ using std::unordered_map;
using std::unordered_set;
using std::vector;
using strings::Substitute;
+using tablet::TABLET_DATA_COPYING;
static const int kConsensusRpcTimeoutForTests = 50;
@@ -446,6 +447,8 @@ void RaftConsensusITest::AddFlagsForLogRolls(vector<string>* extra_tserver_flags
extra_tserver_flags->push_back("--log_max_segments_to_retain=3");
extra_tserver_flags->push_back("--maintenance_manager_polling_interval_ms=100");
extra_tserver_flags->push_back("--log_target_replay_size_mb=1");
+ // We write 128KB cells in CauseFollowerToFallBehindLogGC(): bump the limit.
+ extra_tserver_flags->push_back("--max_cell_size_bytes=1000000");
}
// Test that we can retrieve the permanent uuid of a server running
@@ -831,8 +834,6 @@ TEST_F(RaftConsensusITest, TestFollowerFallsBehindLeaderGC) {
vector<string> ts_flags = {
// Disable follower eviction to maintain the original intent of this test.
"--evict_failed_followers=false",
- // We write 128KB cells in this test, so bump the limit.
- "--max_cell_size_bytes=1000000"
};
AddFlagsForLogRolls(&ts_flags); // For CauseFollowerToFallBehindLogGC().
NO_FATALS(BuildAndStart(ts_flags));
@@ -2813,10 +2814,7 @@ TEST_F(RaftConsensusITest, TestHammerOneRow) {
// Test that followers that fall behind the leader's log GC threshold are
// evicted from the config.
TEST_F(RaftConsensusITest, TestEvictAbandonedFollowers) {
- vector<string> ts_flags = {
- // We write 128KB cells in this test, so bump the limit.
- "--max_cell_size_bytes=1000000"
- };
+ vector<string> ts_flags;
AddFlagsForLogRolls(&ts_flags); // For CauseFollowerToFallBehindLogGC().
vector<string> master_flags = {
"--master_add_server_when_underreplicated=false",
@@ -2840,13 +2838,71 @@ TEST_F(RaftConsensusITest, TestEvictAbandonedFollowers) {
ASSERT_OK(WaitForServersToAgree(timeout, active_tablet_servers, tablet_id_, 2));
}
+// Have a replica fall behind the leader's log, then fail a tablet copy. It
+// should still be able to vote in leader elections.
+TEST_F(RaftConsensusITest, TestTombstonedVoteAfterFailedTabletCopy) {
+ const MonoDelta kTimeout = MonoDelta::FromSeconds(30);
+
+ vector<string> ts_flags;
+ AddFlagsForLogRolls(&ts_flags); // For CauseFollowerToFallBehindLogGC().
+ const vector<string> kMasterFlags = {
+ "--master_add_server_when_underreplicated=false",
+ };
+ NO_FATALS(BuildAndStart(ts_flags, kMasterFlags));
+
+ TabletServerMap active_tablet_servers = tablet_servers_;
+ ASSERT_EQ(3, FLAGS_num_replicas);
+ ASSERT_EQ(3, active_tablet_servers.size());
+
+ string leader_uuid;
+ int64_t orig_term;
+ string follower_uuid;
+ NO_FATALS(CauseFollowerToFallBehindLogGC(&leader_uuid, &orig_term, &follower_uuid));
+
+ // Wait for the abandoned follower to be evicted.
+ ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(2, tablet_servers_[leader_uuid],
+ tablet_id_, kTimeout));
+ ASSERT_EQ(1, active_tablet_servers.erase(follower_uuid));
+ ASSERT_OK(WaitForServersToAgree(kTimeout, active_tablet_servers, tablet_id_, 2));
+
+ // Now the follower is evicted and will be tombstoned.
+ // We'll add it back to the config and then crash the follower during the
+ // resulting tablet copy.
+ ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server_by_uuid(follower_uuid),
+ "tablet_copy_fault_crash_on_fetch_all", "1.0"));
+ auto* leader_ts = tablet_servers_[leader_uuid];
+ auto* follower_ts = tablet_servers_[follower_uuid];
+ ASSERT_OK(itest::AddServer(leader_ts, tablet_id_, follower_ts, RaftPeerPB::VOTER,
+ boost::none, kTimeout));
+ ASSERT_OK(cluster_->tablet_server_by_uuid(follower_uuid)->WaitForInjectedCrash(kTimeout));
+
+ // Shut down the rest of the cluster, then only bring back the node we
+ // crashed, plus one other node. The tombstoned replica should still be able
+ // to vote and the tablet should come online.
+ cluster_->Shutdown();
+
+ int follower_idx = cluster_->tablet_server_index_by_uuid(follower_uuid);
+ ASSERT_OK(inspect_->CheckTabletDataStateOnTS(follower_idx, tablet_id_, { TABLET_DATA_COPYING }));
+
+ ASSERT_OK(cluster_->master()->Restart());
+ ASSERT_OK(cluster_->tablet_server_by_uuid(leader_uuid)->Restart());
+ ASSERT_OK(cluster_->tablet_server_by_uuid(follower_uuid)->Restart());
+
+ TestWorkload workload(cluster_.get());
+ workload.set_table_name(kTableId);
+ workload.set_timeout_allowed(true);
+ workload.Setup();
+ workload.Start();
+ ASSERT_EVENTUALLY([&] {
+ ASSERT_GE(workload.rows_inserted(), 100);
+ });
+ workload.StopAndJoin();
+}
+
// Test that, after followers are evicted from the config, the master re-adds a new
// replica for that follower and it eventually catches back up.
TEST_F(RaftConsensusITest, TestMasterReplacesEvictedFollowers) {
- vector<string> ts_flags = {
- // We write 128KB cells in this test, so bump the limit.
- "--max_cell_size_bytes=1000000"
- };
+ vector<string> ts_flags;
AddFlagsForLogRolls(&ts_flags); // For CauseFollowerToFallBehindLogGC().
NO_FATALS(BuildAndStart(ts_flags));
http://git-wip-us.apache.org/repos/asf/kudu/blob/9d71f99a/src/kudu/tablet/tablet_metadata.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_metadata.cc b/src/kudu/tablet/tablet_metadata.cc
index 4ed5352..d5a271f 100644
--- a/src/kudu/tablet/tablet_metadata.cc
+++ b/src/kudu/tablet/tablet_metadata.cc
@@ -203,7 +203,9 @@ Status TabletMetadata::DeleteTabletData(TabletDataState delete_type,
}
rowsets_.clear();
tablet_data_state_ = delete_type;
- tombstone_last_logged_opid_ = last_logged_opid;
+ if (last_logged_opid) {
+ tombstone_last_logged_opid_ = last_logged_opid;
+ }
}
// Keep a copy of the old data dir group in case of flush failure.
http://git-wip-us.apache.org/repos/asf/kudu/blob/9d71f99a/src/kudu/tserver/tablet_copy_client.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_copy_client.cc b/src/kudu/tserver/tablet_copy_client.cc
index 2b894fe..6d1731e 100644
--- a/src/kudu/tserver/tablet_copy_client.cc
+++ b/src/kudu/tserver/tablet_copy_client.cc
@@ -53,6 +53,7 @@
#include "kudu/util/crc.h"
#include "kudu/util/env.h"
#include "kudu/util/env_util.h"
+#include "kudu/util/fault_injection.h"
#include "kudu/util/flag_tags.h"
#include "kudu/util/logging.h"
#include "kudu/util/monotime.h"
@@ -78,6 +79,12 @@ DEFINE_int32(tablet_copy_dowload_file_inject_latency_ms, 0,
"to take much longer. For use in tests only.");
TAG_FLAG(tablet_copy_dowload_file_inject_latency_ms, hidden);
+DEFINE_double(tablet_copy_fault_crash_on_fetch_all, 0.0,
+ "Fraction of the time that the server will crash when FetchAll() "
+ "is called on the TabletCopyClient. (For testing only!)");
+TAG_FLAG(tablet_copy_fault_crash_on_fetch_all, unsafe);
+TAG_FLAG(tablet_copy_fault_crash_on_fetch_all, runtime);
+
DECLARE_int32(tablet_copy_transfer_chunk_size_bytes);
// RETURN_NOT_OK_PREPEND() with a remote-error unwinding step.
@@ -310,6 +317,8 @@ Status TabletCopyClient::Start(const HostPort& copy_source_addr,
Status TabletCopyClient::FetchAll(const scoped_refptr<TabletReplica>& tablet_replica) {
CHECK_EQ(kStarted, state_);
+ MAYBE_FAULT(FLAGS_tablet_copy_fault_crash_on_fetch_all);
+
tablet_replica_ = tablet_replica;
// Download all the files (serially, for now, but in parallel in the future).
http://git-wip-us.apache.org/repos/asf/kudu/blob/9d71f99a/src/kudu/tserver/ts_tablet_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index da40445..276c1f8 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -1091,7 +1091,7 @@ Status TSTabletManager::DeleteTabletData(
const scoped_refptr<TabletMetadata>& meta,
const scoped_refptr<consensus::ConsensusMetadataManager>& cmeta_manager,
TabletDataState delete_type,
- const boost::optional<OpId>& last_logged_opid) {
+ boost::optional<OpId> last_logged_opid) {
const string& tablet_id = meta->tablet_id();
LOG(INFO) << LogPrefix(tablet_id, meta->fs_manager())
<< "Deleting tablet data with delete state "
@@ -1105,6 +1105,7 @@ Status TSTabletManager::DeleteTabletData(
// Note: Passing an unset 'last_logged_opid' will retain the last_logged_opid
// that was previously in the metadata.
RETURN_NOT_OK(meta->DeleteTabletData(delete_type, last_logged_opid));
+ last_logged_opid = meta->tombstone_last_logged_opid();
LOG(INFO) << LogPrefix(tablet_id, meta->fs_manager())
<< "tablet deleted: last-logged OpId: "
<< (last_logged_opid ? OpIdToString(*last_logged_opid) : "(unknown)");
http://git-wip-us.apache.org/repos/asf/kudu/blob/9d71f99a/src/kudu/tserver/ts_tablet_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.h b/src/kudu/tserver/ts_tablet_manager.h
index 7257d84..86ccd06 100644
--- a/src/kudu/tserver/ts_tablet_manager.h
+++ b/src/kudu/tserver/ts_tablet_manager.h
@@ -188,7 +188,7 @@ class TSTabletManager : public tserver::TabletReplicaLookupIf {
const scoped_refptr<tablet::TabletMetadata>& meta,
const scoped_refptr<consensus::ConsensusMetadataManager>& cmeta_manager,
tablet::TabletDataState delete_type,
- const boost::optional<consensus::OpId>& last_logged_opid);
+ boost::optional<consensus::OpId> last_logged_opid);
// Forces shutdown of the tablet replicas in the data dir corresponding to 'uuid'.
void FailTabletsInDataDir(const std::string& uuid);