You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by mp...@apache.org on 2017/09/07 23:50:37 UTC

kudu git commit: KUDU-2138 delete failed replicas in tablet report

Repository: kudu
Updated Branches:
  refs/heads/master 2f2565b25 -> cd6eee7a3


KUDU-2138 delete failed replicas in tablet report

When a Kudu master processes a replica's tablet report, it will send a
DeleteTablet RPC if the replica is not already deleted, has a consensus
state, has a lower opid index than the latest reported committed config,
and is no longer in the tablet's Raft configuration.

Failed tablets are handled specially in processing the report to return
before any of the above checks can be made. This can lead to failed
tablets lingering when they should actually be deleted.

This patch fixes this by performing these checks and sending the delete
regardless of whether or not the tablet is failed (other than checking
the opid index, since failed tablets do not report an opid index).

A test is added to tablet_replacement-itest to fail a replica, evict but
not tombstone it, report it to the master, and ensure it is properly
deleted.

Change-Id: I34abd2cf4d00098a28dd7645053ccdc8341df03c
Reviewed-on: http://gerrit.cloudera.org:8080/7992
Tested-by: Kudu Jenkins
Reviewed-by: Mike Percy <mp...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/cd6eee7a
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/cd6eee7a
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/cd6eee7a

Branch: refs/heads/master
Commit: cd6eee7a375e053f1f9bed338e603bf9ca91e709
Parents: 2f2565b
Author: Andrew Wong <aw...@cloudera.com>
Authored: Thu Sep 7 00:20:33 2017 -0700
Committer: Mike Percy <mp...@apache.org>
Committed: Thu Sep 7 23:50:08 2017 +0000

----------------------------------------------------------------------
 .../tablet_replacement-itest.cc                 | 74 ++++++++++++++++++++
 src/kudu/master/catalog_manager.cc              | 51 ++++++++------
 2 files changed, 102 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/cd6eee7a/src/kudu/integration-tests/tablet_replacement-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/tablet_replacement-itest.cc b/src/kudu/integration-tests/tablet_replacement-itest.cc
index ff55e59..ab68318 100644
--- a/src/kudu/integration-tests/tablet_replacement-itest.cc
+++ b/src/kudu/integration-tests/tablet_replacement-itest.cc
@@ -20,6 +20,7 @@
 #include <ostream>
 #include <string>
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 #include <boost/bind.hpp>
@@ -141,6 +142,79 @@ TEST_F(TabletReplacementITest, TestMasterTombstoneEvictedReplica) {
   ASSERT_OK(inspect_->CheckTabletDataStateOnTS(kFollowerIndex, tablet_id, { TABLET_DATA_READY }));
 }
 
+// Test for KUDU-2138: ensure that the master will tombstone failed tablets
+// that have previously been evicted.
+TEST_F(TabletReplacementITest, TestMasterTombstoneFailedEvictedReplicaOnReport) {
+  const MonoDelta kTimeout = MonoDelta::FromSeconds(30);
+  const int kNumServers = 4;
+  NO_FATALS(StartCluster({"--follower_unavailable_considered_failed_sec=5"},
+      {"--master_tombstone_evicted_tablet_replicas=false"}, kNumServers));
+
+  TestWorkload workload(cluster_.get());
+  workload.Setup(); // Easy way to create a new tablet.
+
+  // Determine the tablet id.
+  string tablet_id;
+  ASSERT_EVENTUALLY([&] {
+      vector<string> tablets = inspect_->ListTablets();
+      ASSERT_FALSE(tablets.empty());
+      tablet_id = tablets[0];
+  });
+
+  // Determine which tablet servers have data. One should be empty.
+  unordered_map<string, itest::TServerDetails*> active_ts_map = ts_map_;
+  int empty_server_idx = -1;
+  string empty_ts_uuid;
+  consensus::ConsensusMetadataPB cmeta_pb;
+  for (int i = 0; i < kNumServers; i++) {
+    consensus::ConsensusMetadataPB cmeta_pb;
+    if (inspect_->ReadConsensusMetadataOnTS(i, tablet_id, &cmeta_pb).IsNotFound()) {
+      empty_ts_uuid = cluster_->tablet_server(i)->uuid();
+      ASSERT_EQ(1, active_ts_map.erase(empty_ts_uuid));
+      empty_server_idx = i;
+      break;
+    }
+  }
+  ASSERT_NE(empty_server_idx, -1);
+
+  // Wait until all replicas are up and running.
+  for (const auto& e : active_ts_map) {
+    ASSERT_OK(itest::WaitUntilTabletRunning(e.second, tablet_id, kTimeout));
+  }
+
+  // Select a replica to fail by shutting it down and mucking with its
+  // metadata. When it restarts, it will fail to open.
+  int idx_to_fail = (empty_server_idx + 1) % kNumServers;
+  auto* ts = cluster_->tablet_server(idx_to_fail);
+  ts->Shutdown();
+  ASSERT_OK(inspect_->ReadConsensusMetadataOnTS(idx_to_fail, tablet_id, &cmeta_pb));
+  cmeta_pb.set_current_term(-1);
+  ASSERT_OK(inspect_->WriteConsensusMetadataOnTS(idx_to_fail, tablet_id, cmeta_pb));
+
+  // Wait until the replica is evicted and replicated to the empty server.
+  ASSERT_OK(WaitUntilTabletInState(ts_map_[empty_ts_uuid],
+                                   tablet_id,
+                                   tablet::RUNNING,
+                                   kTimeout));
+
+  // Restart the tserver and ensure the tablet is failed.
+  ASSERT_OK(ts->Restart());
+  ASSERT_OK(WaitUntilTabletInState(ts_map_[ts->uuid()],
+                                   tablet_id,
+                                   tablet::FAILED,
+                                   kTimeout));
+
+  // Upon restarting, the master will request a report and notice the failed
+  // replica. Wait for the master to tombstone the failed follower.
+  cluster_->master()->Shutdown();
+  cluster_->master()->mutable_flags()->emplace_back(
+      "--master_tombstone_evicted_tablet_replicas=true");
+  ASSERT_OK(cluster_->master()->Restart());
+  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(idx_to_fail, tablet_id,
+                                                 { TABLET_DATA_TOMBSTONED },
+                                                 kTimeout));
+}
+
 // Ensure that the Master will tombstone a replica if it reports in with an old
 // config. This tests a slightly different code path in the catalog manager
 // than TestMasterTombstoneEvictedReplica does.

http://git-wip-us.apache.org/repos/asf/kudu/blob/cd6eee7a/src/kudu/master/catalog_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/master/catalog_manager.cc b/src/kudu/master/catalog_manager.cc
index ef69cf5..4394058 100644
--- a/src/kudu/master/catalog_manager.cc
+++ b/src/kudu/master/catalog_manager.cc
@@ -2467,6 +2467,34 @@ Status CatalogManager::HandleReportedTablet(TSDescriptor* ts_desc,
     tablet_needs_alter = true;
   }
 
+  // Check if we got a report from a tablet that is no longer part of the Raft
+  // config, but is not already TOMBSTONED / DELETED. If so, tombstone it.
+  //
+  // If the report includes a committed raft config, a delete is only sent if
+  // the opid_index is strictly less than the latest reported committed config.
+  // This prevents us from spuriously deleting replicas that have just been
+  // added to a pending config and are in the process of catching up to the log
+  // entry where they were added to the config.
+  const ConsensusStatePB& prev_cstate = tablet_lock.data().pb.consensus_state();
+  const int64_t prev_opid_index = prev_cstate.committed_config().opid_index();
+  const int64_t report_opid_index = report.has_consensus_state() ?
+      report.consensus_state().committed_config().opid_index() : consensus::kInvalidOpIdIndex;
+  if (FLAGS_master_tombstone_evicted_tablet_replicas &&
+      report.tablet_data_state() != TABLET_DATA_TOMBSTONED &&
+      report.tablet_data_state() != TABLET_DATA_DELETED &&
+      !IsRaftConfigMember(ts_desc->permanent_uuid(), prev_cstate.committed_config()) &&
+      report_opid_index < prev_opid_index) {
+    const string delete_msg = report_opid_index == consensus::kInvalidOpIdIndex ?
+        "Replica has no consensus available" :
+        Substitute("Replica with old config index $0", report_opid_index);
+    SendDeleteReplicaRequest(report.tablet_id(), TABLET_DATA_TOMBSTONED, prev_opid_index,
+        tablet->table(), ts_desc->permanent_uuid(),
+        Substitute("$0 (current committed config index is $1)", delete_msg, prev_opid_index));
+    return Status::OK();
+  }
+
+  // If the tablet has an error and doesn't need to be deleted, there's not
+  // much we can do.
   if (report.has_error()) {
     Status s = StatusFromPB(report.error());
     DCHECK(!s.ok());
@@ -2477,31 +2505,8 @@ Status CatalogManager::HandleReportedTablet(TSDescriptor* ts_desc,
 
   // The report may have a consensus_state even when tombstoned.
   if (report.has_consensus_state()) {
-    const ConsensusStatePB& prev_cstate = tablet_lock.data().pb.consensus_state();
     ConsensusStatePB cstate = report.consensus_state();
 
-    // Check if we got a report from a tablet that is no longer part of the Raft
-    // config, but is not already TOMBSTONED / DELETED. If so, tombstone it. We
-    // only tombstone replicas that include a committed raft config in their
-    // report that has an opid_index strictly less than the latest reported
-    // committed config, and (obviously) who are not members of the latest
-    // config. This prevents us from spuriously deleting replicas that have
-    // just been added to a pending config and are in the process of catching
-    // up to the log entry where they were added to the config.
-    if (FLAGS_master_tombstone_evicted_tablet_replicas &&
-        report.tablet_data_state() != TABLET_DATA_TOMBSTONED &&
-        report.tablet_data_state() != TABLET_DATA_DELETED &&
-        cstate.committed_config().opid_index() < prev_cstate.committed_config().opid_index() &&
-        !IsRaftConfigMember(ts_desc->permanent_uuid(), prev_cstate.committed_config())) {
-      SendDeleteReplicaRequest(report.tablet_id(), TABLET_DATA_TOMBSTONED,
-                               prev_cstate.committed_config().opid_index(),
-                               tablet->table(), ts_desc->permanent_uuid(),
-                               Substitute("Replica from old config with index $0 (latest is $1)",
-                                          cstate.committed_config().opid_index(),
-                                          prev_cstate.committed_config().opid_index()));
-      return Status::OK();
-    }
-
     // If the reported leader is not a member of the committed config, then we
     // disregard the leader state.
     if (cstate.has_leader_uuid() &&