You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by da...@apache.org on 2017/08/31 04:02:16 UTC

[2/3] kudu git commit: consensus: Save previous last-logged OpId across tablet copies

consensus: Save previous last-logged OpId across tablet copies

When a replica falls behind the leader's log, and then fails a tablet
copy, it should still be able to vote in leader elections while
tombstoned. With this patch, that becomes possible. This helps with
availability.

Without this patch, asking a post-tablet copy failure replica to vote
would result in the following RPC response:

IllegalState: must be running to vote when last-logged opid is not known

Change-Id: Ie886764adbdc6d84406df44d6cdb693d038bcd57
Reviewed-on: http://gerrit.cloudera.org:8080/7888
Tested-by: Kudu Jenkins
Reviewed-by: Alexey Serbin <as...@cloudera.com>
Reviewed-by: David Ribeiro Alves <da...@gmail.com>
Reviewed-on: http://gerrit.cloudera.org:8080/7912
Reviewed-by: Dan Burkert <da...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/9d71f99a
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/9d71f99a
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/9d71f99a

Branch: refs/heads/branch-1.5.x
Commit: 9d71f99a9c54e20ff5ab76d91ad6e41b05fc5467
Parents: 0dbc67c
Author: Mike Percy <mp...@apache.org>
Authored: Tue Aug 29 14:20:24 2017 -0700
Committer: Dan Burkert <da...@apache.org>
Committed: Thu Aug 31 04:01:39 2017 +0000

----------------------------------------------------------------------
 .../integration-tests/raft_consensus-itest.cc   | 76 +++++++++++++++++---
 src/kudu/tablet/tablet_metadata.cc              |  4 +-
 src/kudu/tserver/tablet_copy_client.cc          |  9 +++
 src/kudu/tserver/ts_tablet_manager.cc           |  3 +-
 src/kudu/tserver/ts_tablet_manager.h            |  2 +-
 5 files changed, 81 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/9d71f99a/src/kudu/integration-tests/raft_consensus-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus-itest.cc b/src/kudu/integration-tests/raft_consensus-itest.cc
index 91fa7b4..8e4cf8a 100644
--- a/src/kudu/integration-tests/raft_consensus-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus-itest.cc
@@ -141,6 +141,7 @@ using std::unordered_map;
 using std::unordered_set;
 using std::vector;
 using strings::Substitute;
+using tablet::TABLET_DATA_COPYING;
 
 static const int kConsensusRpcTimeoutForTests = 50;
 
@@ -446,6 +447,8 @@ void RaftConsensusITest::AddFlagsForLogRolls(vector<string>* extra_tserver_flags
   extra_tserver_flags->push_back("--log_max_segments_to_retain=3");
   extra_tserver_flags->push_back("--maintenance_manager_polling_interval_ms=100");
   extra_tserver_flags->push_back("--log_target_replay_size_mb=1");
+  // We write 128KB cells in CauseFollowerToFallBehindLogGC(): bump the limit.
+  extra_tserver_flags->push_back("--max_cell_size_bytes=1000000");
 }
 
 // Test that we can retrieve the permanent uuid of a server running
@@ -831,8 +834,6 @@ TEST_F(RaftConsensusITest, TestFollowerFallsBehindLeaderGC) {
   vector<string> ts_flags = {
     // Disable follower eviction to maintain the original intent of this test.
     "--evict_failed_followers=false",
-    // We write 128KB cells in this test, so bump the limit.
-    "--max_cell_size_bytes=1000000"
   };
   AddFlagsForLogRolls(&ts_flags); // For CauseFollowerToFallBehindLogGC().
   NO_FATALS(BuildAndStart(ts_flags));
@@ -2813,10 +2814,7 @@ TEST_F(RaftConsensusITest, TestHammerOneRow) {
 // Test that followers that fall behind the leader's log GC threshold are
 // evicted from the config.
 TEST_F(RaftConsensusITest, TestEvictAbandonedFollowers) {
-  vector<string> ts_flags = {
-    // We write 128KB cells in this test, so bump the limit.
-    "--max_cell_size_bytes=1000000"
-  };
+  vector<string> ts_flags;
   AddFlagsForLogRolls(&ts_flags); // For CauseFollowerToFallBehindLogGC().
   vector<string> master_flags = {
     "--master_add_server_when_underreplicated=false",
@@ -2840,13 +2838,71 @@ TEST_F(RaftConsensusITest, TestEvictAbandonedFollowers) {
   ASSERT_OK(WaitForServersToAgree(timeout, active_tablet_servers, tablet_id_, 2));
 }
 
+// Have a replica fall behind the leader's log, then fail a tablet copy. It
+// should still be able to vote in leader elections.
+TEST_F(RaftConsensusITest, TestTombstonedVoteAfterFailedTabletCopy) {
+  const MonoDelta kTimeout = MonoDelta::FromSeconds(30);
+
+  vector<string> ts_flags;
+  AddFlagsForLogRolls(&ts_flags); // For CauseFollowerToFallBehindLogGC().
+  const vector<string> kMasterFlags = {
+    "--master_add_server_when_underreplicated=false",
+  };
+  NO_FATALS(BuildAndStart(ts_flags, kMasterFlags));
+
+  TabletServerMap active_tablet_servers = tablet_servers_;
+  ASSERT_EQ(3, FLAGS_num_replicas);
+  ASSERT_EQ(3, active_tablet_servers.size());
+
+  string leader_uuid;
+  int64_t orig_term;
+  string follower_uuid;
+  NO_FATALS(CauseFollowerToFallBehindLogGC(&leader_uuid, &orig_term, &follower_uuid));
+
+  // Wait for the abandoned follower to be evicted.
+  ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(2, tablet_servers_[leader_uuid],
+                                                tablet_id_, kTimeout));
+  ASSERT_EQ(1, active_tablet_servers.erase(follower_uuid));
+  ASSERT_OK(WaitForServersToAgree(kTimeout, active_tablet_servers, tablet_id_, 2));
+
+  // Now the follower is evicted and will be tombstoned.
+  // We'll add it back to the config and then crash the follower during the
+  // resulting tablet copy.
+  ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server_by_uuid(follower_uuid),
+            "tablet_copy_fault_crash_on_fetch_all", "1.0"));
+  auto* leader_ts = tablet_servers_[leader_uuid];
+  auto* follower_ts = tablet_servers_[follower_uuid];
+  ASSERT_OK(itest::AddServer(leader_ts, tablet_id_, follower_ts, RaftPeerPB::VOTER,
+                             boost::none, kTimeout));
+  ASSERT_OK(cluster_->tablet_server_by_uuid(follower_uuid)->WaitForInjectedCrash(kTimeout));
+
+  // Shut down the rest of the cluster, then only bring back the node we
+  // crashed, plus one other node. The tombstoned replica should still be able
+  // to vote and the tablet should come online.
+  cluster_->Shutdown();
+
+  int follower_idx = cluster_->tablet_server_index_by_uuid(follower_uuid);
+  ASSERT_OK(inspect_->CheckTabletDataStateOnTS(follower_idx, tablet_id_, { TABLET_DATA_COPYING }));
+
+  ASSERT_OK(cluster_->master()->Restart());
+  ASSERT_OK(cluster_->tablet_server_by_uuid(leader_uuid)->Restart());
+  ASSERT_OK(cluster_->tablet_server_by_uuid(follower_uuid)->Restart());
+
+  TestWorkload workload(cluster_.get());
+  workload.set_table_name(kTableId);
+  workload.set_timeout_allowed(true);
+  workload.Setup();
+  workload.Start();
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_GE(workload.rows_inserted(), 100);
+  });
+  workload.StopAndJoin();
+}
+
 // Test that, after followers are evicted from the config, the master re-adds a new
 // replica for that follower and it eventually catches back up.
 TEST_F(RaftConsensusITest, TestMasterReplacesEvictedFollowers) {
-  vector<string> ts_flags = {
-    // We write 128KB cells in this test, so bump the limit.
-    "--max_cell_size_bytes=1000000"
-  };
+  vector<string> ts_flags;
   AddFlagsForLogRolls(&ts_flags); // For CauseFollowerToFallBehindLogGC().
   NO_FATALS(BuildAndStart(ts_flags));
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/9d71f99a/src/kudu/tablet/tablet_metadata.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_metadata.cc b/src/kudu/tablet/tablet_metadata.cc
index 4ed5352..d5a271f 100644
--- a/src/kudu/tablet/tablet_metadata.cc
+++ b/src/kudu/tablet/tablet_metadata.cc
@@ -203,7 +203,9 @@ Status TabletMetadata::DeleteTabletData(TabletDataState delete_type,
     }
     rowsets_.clear();
     tablet_data_state_ = delete_type;
-    tombstone_last_logged_opid_ = last_logged_opid;
+    if (last_logged_opid) {
+      tombstone_last_logged_opid_ = last_logged_opid;
+    }
   }
 
   // Keep a copy of the old data dir group in case of flush failure.

http://git-wip-us.apache.org/repos/asf/kudu/blob/9d71f99a/src/kudu/tserver/tablet_copy_client.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_copy_client.cc b/src/kudu/tserver/tablet_copy_client.cc
index 2b894fe..6d1731e 100644
--- a/src/kudu/tserver/tablet_copy_client.cc
+++ b/src/kudu/tserver/tablet_copy_client.cc
@@ -53,6 +53,7 @@
 #include "kudu/util/crc.h"
 #include "kudu/util/env.h"
 #include "kudu/util/env_util.h"
+#include "kudu/util/fault_injection.h"
 #include "kudu/util/flag_tags.h"
 #include "kudu/util/logging.h"
 #include "kudu/util/monotime.h"
@@ -78,6 +79,12 @@ DEFINE_int32(tablet_copy_dowload_file_inject_latency_ms, 0,
              "to take much longer. For use in tests only.");
 TAG_FLAG(tablet_copy_dowload_file_inject_latency_ms, hidden);
 
+DEFINE_double(tablet_copy_fault_crash_on_fetch_all, 0.0,
+              "Fraction of the time that the server will crash when FetchAll() "
+              "is called on the TabletCopyClient. (For testing only!)");
+TAG_FLAG(tablet_copy_fault_crash_on_fetch_all, unsafe);
+TAG_FLAG(tablet_copy_fault_crash_on_fetch_all, runtime);
+
 DECLARE_int32(tablet_copy_transfer_chunk_size_bytes);
 
 // RETURN_NOT_OK_PREPEND() with a remote-error unwinding step.
@@ -310,6 +317,8 @@ Status TabletCopyClient::Start(const HostPort& copy_source_addr,
 Status TabletCopyClient::FetchAll(const scoped_refptr<TabletReplica>& tablet_replica) {
   CHECK_EQ(kStarted, state_);
 
+  MAYBE_FAULT(FLAGS_tablet_copy_fault_crash_on_fetch_all);
+
   tablet_replica_ = tablet_replica;
 
   // Download all the files (serially, for now, but in parallel in the future).

http://git-wip-us.apache.org/repos/asf/kudu/blob/9d71f99a/src/kudu/tserver/ts_tablet_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index da40445..276c1f8 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -1091,7 +1091,7 @@ Status TSTabletManager::DeleteTabletData(
     const scoped_refptr<TabletMetadata>& meta,
     const scoped_refptr<consensus::ConsensusMetadataManager>& cmeta_manager,
     TabletDataState delete_type,
-    const boost::optional<OpId>& last_logged_opid) {
+    boost::optional<OpId> last_logged_opid) {
   const string& tablet_id = meta->tablet_id();
   LOG(INFO) << LogPrefix(tablet_id, meta->fs_manager())
             << "Deleting tablet data with delete state "
@@ -1105,6 +1105,7 @@ Status TSTabletManager::DeleteTabletData(
   // Note: Passing an unset 'last_logged_opid' will retain the last_logged_opid
   // that was previously in the metadata.
   RETURN_NOT_OK(meta->DeleteTabletData(delete_type, last_logged_opid));
+  last_logged_opid = meta->tombstone_last_logged_opid();
   LOG(INFO) << LogPrefix(tablet_id, meta->fs_manager())
             << "tablet deleted: last-logged OpId: "
             << (last_logged_opid ? OpIdToString(*last_logged_opid) : "(unknown)");

http://git-wip-us.apache.org/repos/asf/kudu/blob/9d71f99a/src/kudu/tserver/ts_tablet_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.h b/src/kudu/tserver/ts_tablet_manager.h
index 7257d84..86ccd06 100644
--- a/src/kudu/tserver/ts_tablet_manager.h
+++ b/src/kudu/tserver/ts_tablet_manager.h
@@ -188,7 +188,7 @@ class TSTabletManager : public tserver::TabletReplicaLookupIf {
       const scoped_refptr<tablet::TabletMetadata>& meta,
       const scoped_refptr<consensus::ConsensusMetadataManager>& cmeta_manager,
       tablet::TabletDataState delete_type,
-      const boost::optional<consensus::OpId>& last_logged_opid);
+      boost::optional<consensus::OpId> last_logged_opid);
 
   // Forces shutdown of the tablet replicas in the data dir corresponding to 'uuid'.
   void FailTabletsInDataDir(const std::string& uuid);