You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by mp...@apache.org on 2017/08/28 19:02:42 UTC
kudu git commit: KUDU-2114. Don't re-delete tombstoned replicas

Repository: kudu
Updated Branches:
  refs/heads/master 1c70e5df8 -> 22a19d93a


KUDU-2114. Don't re-delete tombstoned replicas

A bug was introduced in 5bca7d8ba185d62952fb3e3163cbe88d20453da0 where
the master will now consider tombstoned replicas to not be deleted, and
will attempt to re-delete them when it gets a tablet report on them.
This patch fixes the problem.

Change-Id: I86e5cbe0681557ae86f5bae53f4aeb7635fa6aa6
Reviewed-on: http://gerrit.cloudera.org:8080/7842
Reviewed-by: Adar Dembo <ad...@cloudera.com>
Tested-by: Mike Percy <mp...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/22a19d93
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/22a19d93
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/22a19d93

Branch: refs/heads/master
Commit: 22a19d93ab9f1df339b8fe0fde094c406ce5cb2b
Parents: 1c70e5d
Author: Mike Percy <mp...@apache.org>
Authored: Fri Aug 25 14:57:09 2017 -0700
Committer: Mike Percy <mp...@apache.org>
Committed: Sun Aug 27 18:09:15 2017 +0000

----------------------------------------------------------------------
 .../integration-tests/delete_table-itest.cc     | 139 ++++++++++++++++++-
 src/kudu/master/catalog_manager.cc              |  28 ++--
 2 files changed, 152 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/22a19d93/src/kudu/integration-tests/delete_table-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/delete_table-itest.cc b/src/kudu/integration-tests/delete_table-itest.cc
index 105fd77..f824db4 100644
--- a/src/kudu/integration-tests/delete_table-itest.cc
+++ b/src/kudu/integration-tests/delete_table-itest.cc
@@ -22,6 +22,7 @@
 #include <limits>
 #include <memory>
 #include <ostream>
+#include <set>
 #include <string>
 #include <tuple>  // IWYU pragma: keep
 #include <unordered_map>
@@ -43,9 +44,11 @@
 #include "kudu/common/wire_protocol-test-util.h"
 #include "kudu/common/wire_protocol.h"
 #include "kudu/common/wire_protocol.pb.h"
+#include "kudu/consensus/consensus.pb.h"
 #include "kudu/consensus/metadata.pb.h"
 #include "kudu/gutil/basictypes.h"
 #include "kudu/gutil/gscoped_ptr.h"
+#include "kudu/gutil/map-util.h"
 #include "kudu/gutil/strings/split.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/integration-tests/cluster_itest_util.h"
@@ -70,6 +73,7 @@
 #include "kudu/util/pb_util.h"
 #include "kudu/util/pstack_watcher.h"
 #include "kudu/util/status.h"
+#include "kudu/util/stopwatch.h"
 #include "kudu/util/subprocess.h"
 #include "kudu/util/test_macros.h"
 #include "kudu/util/test_util.h"
@@ -81,6 +85,7 @@ using kudu::client::KuduSchema;
 using kudu::client::KuduSchemaFromSchema;
 using kudu::client::KuduTable;
 using kudu::client::KuduTableCreator;
+using kudu::consensus::COMMITTED_OPID;
 using kudu::consensus::ConsensusMetadataPB;
 using kudu::consensus::ConsensusStatePB;
 using kudu::consensus::RaftPeerPB;
@@ -1079,7 +1084,6 @@ TEST_F(DeleteTableITest, TestWebPageForTombstonedTablet) {
   }
 }
 
-
 TEST_F(DeleteTableITest, TestUnknownTabletsAreNotDeleted) {
   // Speed up heartbeating so that the unknown tablet is detected faster.
   vector<string> extra_ts_flags = { "--heartbeat_interval_ms=10" };
@@ -1149,7 +1153,140 @@ TEST_F(DeleteTableITest, TestUnknownTabletsAreNotDeleted) {
     ASSERT_GE(num_delete_attempts, 1);
     ASSERT_OK(CheckTabletDeletedOnTS(0, tablet_id, SUPERBLOCK_NOT_EXPECTED));
   });
+}
+
+// Ensure that the master doesn't try to delete tombstoned tablets.
+// Regression test for KUDU-2114.
+TEST_F(DeleteTableITest, TestNoDeleteTombstonedTablets) {
+  if (!AllowSlowTests()) {
+    LOG(WARNING) << "This test sleeps for several seconds and only runs in slow-test mode";
+    return;
+  }
+
+  const MonoDelta kTimeout = MonoDelta::FromSeconds(30);
+  NO_FATALS(StartCluster({}, {}, /*num_tablet_servers=*/ 4));
+  const int kNumReplicas = 3;
+
+  // Create a table on the cluster. We're just using TestWorkload
+  // as a convenient way to create it.
+  TestWorkload(cluster_.get()).Setup();
+
+  // The table should have replicas on three tservers.
+  ASSERT_OK(inspect_->WaitForReplicaCount(kNumReplicas));
+  master::GetTableLocationsResponsePB table_locations;
+  ASSERT_OK(itest::GetTableLocations(cluster_->master_proxy(), TestWorkload::kDefaultTableName,
+                                     kTimeout, &table_locations));
+  ASSERT_EQ(1, table_locations.tablet_locations_size()); // Only 1 tablet.
+  string tablet_id;
+  std::set<string> replicas;
+  for (const auto& t : table_locations.tablet_locations()) {
+    tablet_id = t.tablet_id();
+    for (const auto& r : t.replicas()) {
+      replicas.insert(r.ts_info().permanent_uuid());
+    }
+  }
+
+  LOG(INFO) << "finding leader...";
+
+  // Find leader.
+  TServerDetails* leader = nullptr;
+  ASSERT_OK(FindTabletLeader(ts_map_, tablet_id, kTimeout, &leader));
+  ASSERT_OK(WaitForOpFromCurrentTerm(leader, tablet_id, COMMITTED_OPID, kTimeout));
+
+  LOG(INFO) << "replacing replica...";
+
+  // Prepare to replace a replica.
+  TServerDetails* to_add = nullptr;
+  TServerDetails* to_remove = nullptr;
+  int to_remove_index = -1;
+  ASSERT_NE(nullptr, leader);
+  for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
+    TServerDetails* ts = ts_map_[cluster_->tablet_server(i)->uuid()];
+    if (!ContainsKey(replicas, cluster_->tablet_server(i)->uuid())) {
+      to_add = ts;
+    } else {
+      if (ts == leader) continue;
+      to_remove = ts;
+      to_remove_index = i;
+    }
+  }
+  ASSERT_NE(nullptr, to_add);
+  ASSERT_NE(nullptr, to_remove);
+
+  // Do the config changes and wait for the master to delete the old node.
+  ASSERT_OK(AddServer(leader, tablet_id, to_add, RaftPeerPB::VOTER, boost::none, kTimeout));
+  ASSERT_OK(RemoveServer(leader, tablet_id, to_remove, boost::none, kTimeout));
+
+  LOG(INFO) << "waiting for no data on ts...";
+
+  // Most of the time, we will only get 1 deletion attempt when a replica is
+  // evicted, but to avoid flakiness in this test we leave some slack for a
+  // couple more due to multiple requests coming in at once, such as the config
+  // change plus a couple of heartbeats from the evicted node. If we set this
+  // to 1, this test ends up being a few percent flaky under stress.
+  const int kMaxDeleteAttemptsPerEviction = 3;
+
+  // 'to_remove' should eventually have no tablets and the metrics should show
+  // a deleted tablet.
+  ASSERT_EVENTUALLY([&] {
+    inspect_->WaitForNoDataOnTS(to_remove_index, kTimeout);
+
+    int64_t num_delete_attempts;
+    ASSERT_OK(cluster_->tablet_server(to_remove_index)->GetInt64Metric(
+        &METRIC_ENTITY_server, "kudu.tabletserver",
+        &METRIC_handler_latency_kudu_tserver_TabletServerAdminService_DeleteTablet,
+        "total_count", &num_delete_attempts));
+    ASSERT_GE(num_delete_attempts, 1);
+    ASSERT_LE(num_delete_attempts, kMaxDeleteAttemptsPerEviction);
+  });
+
+  // Wait for the metrics to change without restarting, and then do the same
+  // check after a restart, to test both the code paths in the master (one
+  // related to config change, one related to receiving a full tablet report).
+  for (int i = 0; i < 2; i++) {
+    LOG(INFO) << "iter " << i << "...";
+    int max_expected_deletes = kMaxDeleteAttemptsPerEviction;
+    if (i == 1) {
+      LOG(INFO) << "restarting cluster...";
+      cluster_->Shutdown();
+      ASSERT_OK(cluster_->Restart());
+      max_expected_deletes = 0; // There should be no attempts in this case.
+    }
+
+    LOG(INFO) << "waiting for metrics...";
 
+    SCOPED_TRACE(i);
+    SCOPED_TRACE(max_expected_deletes);
+
+    int64_t prev_heartbeats;
+    ASSERT_OK(cluster_->master()->GetInt64Metric(
+        &METRIC_ENTITY_server, "kudu.master",
+        &METRIC_handler_latency_kudu_master_MasterService_TSHeartbeat,
+        "total_count",
+        &prev_heartbeats));
+
+    // Wait until several heartbeats have been received by the master.
+    // We should still have the expected number of delete requests.
+    SCOPED_LOG_TIMING(INFO, "waiting for heartbeats");
+    ASSERT_EVENTUALLY([&] {
+      int64_t num_delete_attempts;
+      ASSERT_OK(cluster_->tablet_server(to_remove_index)->GetInt64Metric(
+          &METRIC_ENTITY_server, "kudu.tabletserver",
+          &METRIC_handler_latency_kudu_tserver_TabletServerAdminService_DeleteTablet,
+          "total_count",
+          &num_delete_attempts));
+      ASSERT_LE(num_delete_attempts, max_expected_deletes);
+
+      int64_t num_heartbeats;
+      ASSERT_OK(cluster_->master()->GetInt64Metric(
+          &METRIC_ENTITY_server, "kudu.master",
+          &METRIC_handler_latency_kudu_master_MasterService_TSHeartbeat,
+          "total_count",
+          &num_heartbeats));
+      const int kHeartbeatsToWait = 3;
+      ASSERT_GE(num_heartbeats, prev_heartbeats + (kNumReplicas * kHeartbeatsToWait));
+    });
+  }
 }
 
 // Parameterized test case for TABLET_DATA_DELETED deletions.

http://git-wip-us.apache.org/repos/asf/kudu/blob/22a19d93/src/kudu/master/catalog_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/master/catalog_manager.cc b/src/kudu/master/catalog_manager.cc
index bfbbe7b..517082c 100644
--- a/src/kudu/master/catalog_manager.cc
+++ b/src/kudu/master/catalog_manager.cc
@@ -2477,7 +2477,6 @@ Status CatalogManager::HandleReportedTablet(TSDescriptor* ts_desc,
     tablet_needs_alter = true;
   }
 
-
   if (report.has_error()) {
     Status s = StatusFromPB(report.error());
     DCHECK(!s.ok());
@@ -2486,21 +2485,22 @@ Status CatalogManager::HandleReportedTablet(TSDescriptor* ts_desc,
     return Status::OK();
   }
 
-  // The report will not have a consensus_state if it is in the
-  // middle of starting up, such as during tablet bootstrap.
+  // The report may have a consensus_state even when tombstoned.
   if (report.has_consensus_state()) {
     const ConsensusStatePB& prev_cstate = tablet_lock.data().pb.consensus_state();
     ConsensusStatePB cstate = report.consensus_state();
 
-    // Check if we got a report from a tablet that is no longer part of the raft
-    // config. If so, tombstone it. We only tombstone replicas that include a
-    // committed raft config in their report that has an opid_index strictly
-    // less than the latest reported committed config, and (obviously) who are
-    // not members of the latest config. This prevents us from spuriously
-    // deleting replicas that have just been added to a pending config and are
-    // in the process of catching up to the log entry where they were added to
-    // the config.
+    // Check if we got a report from a tablet that is no longer part of the Raft
+    // config, but is not already TOMBSTONED / DELETED. If so, tombstone it. We
+    // only tombstone replicas that include a committed raft config in their
+    // report that has an opid_index strictly less than the latest reported
+    // committed config, and (obviously) who are not members of the latest
+    // config. This prevents us from spuriously deleting replicas that have
+    // just been added to a pending config and are in the process of catching
+    // up to the log entry where they were added to the config.
     if (FLAGS_master_tombstone_evicted_tablet_replicas &&
+        report.tablet_data_state() != TABLET_DATA_TOMBSTONED &&
+        report.tablet_data_state() != TABLET_DATA_DELETED &&
         cstate.committed_config().opid_index() < prev_cstate.committed_config().opid_index() &&
         !IsRaftConfigMember(ts_desc->permanent_uuid(), prev_cstate.committed_config())) {
       SendDeleteReplicaRequest(report.tablet_id(), TABLET_DATA_TOMBSTONED,
@@ -2528,9 +2528,9 @@ Status CatalogManager::HandleReportedTablet(TSDescriptor* ts_desc,
       DCHECK_EQ(SysTabletsEntryPB::CREATING, tablet_lock.data().pb.state())
           << Substitute("Tablet in unexpected state: $0: $1", tablet->ToString(),
                         SecureShortDebugString(tablet_lock.data().pb));
-      // Mark the tablet as running
-      // TODO: we could batch the IO onto a background thread, or at least
-      // across multiple tablets in the same report.
+      // Mark the tablet as running.
+      // TODO(matteo): we could batch the IO onto a background thread, or at
+      // least across multiple tablets in the same report.
       VLOG(1) << Substitute("Tablet $0 is now online", tablet->ToString());
       tablet_lock.mutable_data()->set_state(SysTabletsEntryPB::RUNNING,
                                             "Tablet reported with an active leader");