You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by da...@apache.org on 2017/08/31 04:02:15 UTC

[1/3] kudu git commit: KUDU-2118. Fully shut down TabletReplica on delete

Repository: kudu
Updated Branches:
  refs/heads/branch-1.5.x 333b57822 -> 2de07e594


KUDU-2118. Fully shut down TabletReplica on delete

After the various lifecycle changes implemented in
5bca7d8ba185d62952fb3e3163cbe88d20453da0 we were left with a case where
we fail to fully shut down TABLET_DATA_DELETED replicas before
dereferencing them. Because LeaderElection via ElectionDecisionCallback
will hold a reference to RaftConsensus while a vote is taking place, the
leader election callback running on the reactor thread may destroy the
RaftConsensus object. If that occurs without RaftConsensus already being
in a fully-shutdown state, the RaftConsensus destructor runs Shutdown()
before destoying the object, which takes a lock and triggers an
assertion via AssertWaitAllowed(). This crashes the server.

This patch fixes the issue by calling Shutdown() on TABLET_DATA_DELETED
replicas before dereferencing them in TSTabletManager.

A concurrency test is included that spawns threads to start elections
while the tablets are being deleted. This test pretty reliably fails
without the included fix, and is solidly stable with the fix.

Change-Id: I4a0ea46f220a10d4de680fc73d45d14426e0395e
Reviewed-on: http://gerrit.cloudera.org:8080/7883
Tested-by: Kudu Jenkins
Reviewed-by: Adar Dembo <ad...@cloudera.com>
Reviewed-on: http://gerrit.cloudera.org:8080/7911
Reviewed-by: Dan Burkert <da...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/0dbc67c6
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/0dbc67c6
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/0dbc67c6

Branch: refs/heads/branch-1.5.x
Commit: 0dbc67c64debe6d8893a06021a0d14bbc54efd0d
Parents: 333b578
Author: Mike Percy <mp...@apache.org>
Authored: Tue Aug 29 13:44:46 2017 -0700
Committer: Dan Burkert <da...@apache.org>
Committed: Thu Aug 31 04:01:30 2017 +0000

----------------------------------------------------------------------
 .../integration-tests/delete_tablet-itest.cc    | 55 +++++++++++++++++++-
 src/kudu/tserver/ts_tablet_manager.cc           |  9 ++--
 2 files changed, 59 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/0dbc67c6/src/kudu/integration-tests/delete_tablet-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/delete_tablet-itest.cc b/src/kudu/integration-tests/delete_tablet-itest.cc
index 447a4cf..0dce3fb 100644
--- a/src/kudu/integration-tests/delete_tablet-itest.cc
+++ b/src/kudu/integration-tests/delete_tablet-itest.cc
@@ -15,13 +15,17 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <atomic>
 #include <cstdint>
 #include <memory>
+#include <string>
+#include <thread>
 #include <unordered_map>
 #include <vector>
 
 #include <boost/optional/optional.hpp>
 #include <gflags/gflags_declare.h>
+#include <glog/stl_logging.h>
 #include <gtest/gtest.h>
 
 #include "kudu/gutil/ref_counted.h"
@@ -43,7 +47,11 @@
 
 DECLARE_int64(fs_wal_dir_reserved_bytes);
 
+using kudu::tablet::TABLET_DATA_DELETED;
 using kudu::tablet::TabletReplica;
+using std::atomic;
+using std::string;
+using std::thread;
 using std::vector;
 
 namespace kudu {
@@ -53,7 +61,7 @@ class DeleteTabletITest : public MiniClusterITestBase {
 
 // Test deleting a failed replica. Regression test for KUDU-1607.
 TEST_F(DeleteTabletITest, TestDeleteFailedReplica) {
-  NO_FATALS(StartCluster(1)); // 1 TS.
+  NO_FATALS(StartCluster(/*num_tablet_servers=*/ 1));
   TestWorkload workload(cluster_.get());
   workload.set_num_replicas(1);
   workload.Setup();
@@ -111,4 +119,49 @@ TEST_F(DeleteTabletITest, TestDeleteFailedReplica) {
   });
 }
 
+// Test that a leader election will not cause a crash when a tablet is deleted.
+// Regression test for KUDU-2118.
+TEST_F(DeleteTabletITest, TestLeaderElectionDuringDeleteTablet) {
+  const MonoDelta kTimeout = MonoDelta::FromSeconds(30);
+  const int kNumTablets = 10;
+  NO_FATALS(StartCluster());
+  TestWorkload workload(cluster_.get());
+  workload.set_num_tablets(kNumTablets);
+  workload.Setup();
+  vector<string> tablet_ids;
+  ASSERT_EVENTUALLY([&] {
+    tablet_ids = cluster_->mini_tablet_server(0)->ListTablets();
+    ASSERT_EQ(kNumTablets, tablet_ids.size()) << tablet_ids;
+  });
+
+  // Start threads that start leader elections.
+  vector<thread> threads;
+  atomic<bool> running(true);
+  auto* ts = ts_map_[cluster_->mini_tablet_server(0)->uuid()];
+  for (const string& tablet_id : tablet_ids) {
+    threads.emplace_back([ts, tablet_id, &running, &kTimeout] {
+      while (running) {
+        itest::StartElection(ts, tablet_id, kTimeout); // Ignore result.
+      }
+    });
+  }
+
+  // Sequentially delete all of the tablets.
+  for (const string& tablet_id : tablet_ids) {
+    ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_DELETED, boost::none, kTimeout));
+  }
+
+  // Wait until all tablets are deleted.
+  ASSERT_EVENTUALLY([&] {
+    tablet_ids = cluster_->mini_tablet_server(0)->ListTablets();
+    ASSERT_EQ(0, tablet_ids.size()) << tablet_ids;
+  });
+
+  // Stop all the election threads.
+  running = false;
+  for (auto& t : threads) {
+    t.join();
+  }
+}
+
 } // namespace kudu

http://git-wip-us.apache.org/repos/asf/kudu/blob/0dbc67c6/src/kudu/tserver/ts_tablet_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index b014aab..da40445 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -657,8 +657,8 @@ Status TSTabletManager::DeleteTablet(
   // If the tablet is already deleted, the CAS check isn't possible because
   // consensus and therefore the log is not available.
   TabletDataState data_state = replica->tablet_metadata()->tablet_data_state();
-  bool tablet_deleted = (data_state == TABLET_DATA_DELETED ||
-                         data_state == TABLET_DATA_TOMBSTONED);
+  bool tablet_already_deleted = (data_state == TABLET_DATA_DELETED ||
+                                 data_state == TABLET_DATA_TOMBSTONED);
 
   // They specified an "atomic" delete. Check the committed config's opid_index.
   // TODO(mpercy): There's actually a race here between the check and shutdown,
@@ -666,7 +666,7 @@ Status TSTabletManager::DeleteTablet(
   // restarting the tablet if the local replica committed a higher config change
   // op during that time, or potentially something else more invasive.
   shared_ptr<RaftConsensus> consensus = replica->shared_consensus();
-  if (cas_config_opid_index_less_or_equal && !tablet_deleted) {
+  if (cas_config_opid_index_less_or_equal && !tablet_already_deleted) {
     if (!consensus) {
       *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING;
       return Status::IllegalState("Raft Consensus not available. Tablet shutting down");
@@ -707,8 +707,9 @@ Status TSTabletManager::DeleteTablet(
 
   replica->SetStatusMessage("Deleted tablet blocks from disk");
 
-  // We only remove DELETED tablets from the tablet map.
+  // Only DELETED tablets are fully shut down and removed from the tablet map.
   if (delete_type == TABLET_DATA_DELETED) {
+    replica->Shutdown();
     std::lock_guard<rw_spinlock> lock(lock_);
     RETURN_NOT_OK(CheckRunningUnlocked(error_code));
     CHECK_EQ(1, tablet_map_.erase(tablet_id)) << tablet_id;


[2/3] kudu git commit: consensus: Save previous last-logged OpId across tablet copies

Posted by da...@apache.org.
consensus: Save previous last-logged OpId across tablet copies

When a replica falls behind the leader's log, and then fails a tablet
copy, it should still be able to vote in leader elections while
tombstoned. With this patch, that becomes possible. This helps with
availability.

Without this patch, asking a post-tablet copy failure replica to vote
would result in the following RPC response:

IllegalState: must be running to vote when last-logged opid is not known

Change-Id: Ie886764adbdc6d84406df44d6cdb693d038bcd57
Reviewed-on: http://gerrit.cloudera.org:8080/7888
Tested-by: Kudu Jenkins
Reviewed-by: Alexey Serbin <as...@cloudera.com>
Reviewed-by: David Ribeiro Alves <da...@gmail.com>
Reviewed-on: http://gerrit.cloudera.org:8080/7912
Reviewed-by: Dan Burkert <da...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/9d71f99a
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/9d71f99a
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/9d71f99a

Branch: refs/heads/branch-1.5.x
Commit: 9d71f99a9c54e20ff5ab76d91ad6e41b05fc5467
Parents: 0dbc67c
Author: Mike Percy <mp...@apache.org>
Authored: Tue Aug 29 14:20:24 2017 -0700
Committer: Dan Burkert <da...@apache.org>
Committed: Thu Aug 31 04:01:39 2017 +0000

----------------------------------------------------------------------
 .../integration-tests/raft_consensus-itest.cc   | 76 +++++++++++++++++---
 src/kudu/tablet/tablet_metadata.cc              |  4 +-
 src/kudu/tserver/tablet_copy_client.cc          |  9 +++
 src/kudu/tserver/ts_tablet_manager.cc           |  3 +-
 src/kudu/tserver/ts_tablet_manager.h            |  2 +-
 5 files changed, 81 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/9d71f99a/src/kudu/integration-tests/raft_consensus-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus-itest.cc b/src/kudu/integration-tests/raft_consensus-itest.cc
index 91fa7b4..8e4cf8a 100644
--- a/src/kudu/integration-tests/raft_consensus-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus-itest.cc
@@ -141,6 +141,7 @@ using std::unordered_map;
 using std::unordered_set;
 using std::vector;
 using strings::Substitute;
+using tablet::TABLET_DATA_COPYING;
 
 static const int kConsensusRpcTimeoutForTests = 50;
 
@@ -446,6 +447,8 @@ void RaftConsensusITest::AddFlagsForLogRolls(vector<string>* extra_tserver_flags
   extra_tserver_flags->push_back("--log_max_segments_to_retain=3");
   extra_tserver_flags->push_back("--maintenance_manager_polling_interval_ms=100");
   extra_tserver_flags->push_back("--log_target_replay_size_mb=1");
+  // We write 128KB cells in CauseFollowerToFallBehindLogGC(): bump the limit.
+  extra_tserver_flags->push_back("--max_cell_size_bytes=1000000");
 }
 
 // Test that we can retrieve the permanent uuid of a server running
@@ -831,8 +834,6 @@ TEST_F(RaftConsensusITest, TestFollowerFallsBehindLeaderGC) {
   vector<string> ts_flags = {
     // Disable follower eviction to maintain the original intent of this test.
     "--evict_failed_followers=false",
-    // We write 128KB cells in this test, so bump the limit.
-    "--max_cell_size_bytes=1000000"
   };
   AddFlagsForLogRolls(&ts_flags); // For CauseFollowerToFallBehindLogGC().
   NO_FATALS(BuildAndStart(ts_flags));
@@ -2813,10 +2814,7 @@ TEST_F(RaftConsensusITest, TestHammerOneRow) {
 // Test that followers that fall behind the leader's log GC threshold are
 // evicted from the config.
 TEST_F(RaftConsensusITest, TestEvictAbandonedFollowers) {
-  vector<string> ts_flags = {
-    // We write 128KB cells in this test, so bump the limit.
-    "--max_cell_size_bytes=1000000"
-  };
+  vector<string> ts_flags;
   AddFlagsForLogRolls(&ts_flags); // For CauseFollowerToFallBehindLogGC().
   vector<string> master_flags = {
     "--master_add_server_when_underreplicated=false",
@@ -2840,13 +2838,71 @@ TEST_F(RaftConsensusITest, TestEvictAbandonedFollowers) {
   ASSERT_OK(WaitForServersToAgree(timeout, active_tablet_servers, tablet_id_, 2));
 }
 
+// Have a replica fall behind the leader's log, then fail a tablet copy. It
+// should still be able to vote in leader elections.
+TEST_F(RaftConsensusITest, TestTombstonedVoteAfterFailedTabletCopy) {
+  const MonoDelta kTimeout = MonoDelta::FromSeconds(30);
+
+  vector<string> ts_flags;
+  AddFlagsForLogRolls(&ts_flags); // For CauseFollowerToFallBehindLogGC().
+  const vector<string> kMasterFlags = {
+    "--master_add_server_when_underreplicated=false",
+  };
+  NO_FATALS(BuildAndStart(ts_flags, kMasterFlags));
+
+  TabletServerMap active_tablet_servers = tablet_servers_;
+  ASSERT_EQ(3, FLAGS_num_replicas);
+  ASSERT_EQ(3, active_tablet_servers.size());
+
+  string leader_uuid;
+  int64_t orig_term;
+  string follower_uuid;
+  NO_FATALS(CauseFollowerToFallBehindLogGC(&leader_uuid, &orig_term, &follower_uuid));
+
+  // Wait for the abandoned follower to be evicted.
+  ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(2, tablet_servers_[leader_uuid],
+                                                tablet_id_, kTimeout));
+  ASSERT_EQ(1, active_tablet_servers.erase(follower_uuid));
+  ASSERT_OK(WaitForServersToAgree(kTimeout, active_tablet_servers, tablet_id_, 2));
+
+  // Now the follower is evicted and will be tombstoned.
+  // We'll add it back to the config and then crash the follower during the
+  // resulting tablet copy.
+  ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server_by_uuid(follower_uuid),
+            "tablet_copy_fault_crash_on_fetch_all", "1.0"));
+  auto* leader_ts = tablet_servers_[leader_uuid];
+  auto* follower_ts = tablet_servers_[follower_uuid];
+  ASSERT_OK(itest::AddServer(leader_ts, tablet_id_, follower_ts, RaftPeerPB::VOTER,
+                             boost::none, kTimeout));
+  ASSERT_OK(cluster_->tablet_server_by_uuid(follower_uuid)->WaitForInjectedCrash(kTimeout));
+
+  // Shut down the rest of the cluster, then only bring back the node we
+  // crashed, plus one other node. The tombstoned replica should still be able
+  // to vote and the tablet should come online.
+  cluster_->Shutdown();
+
+  int follower_idx = cluster_->tablet_server_index_by_uuid(follower_uuid);
+  ASSERT_OK(inspect_->CheckTabletDataStateOnTS(follower_idx, tablet_id_, { TABLET_DATA_COPYING }));
+
+  ASSERT_OK(cluster_->master()->Restart());
+  ASSERT_OK(cluster_->tablet_server_by_uuid(leader_uuid)->Restart());
+  ASSERT_OK(cluster_->tablet_server_by_uuid(follower_uuid)->Restart());
+
+  TestWorkload workload(cluster_.get());
+  workload.set_table_name(kTableId);
+  workload.set_timeout_allowed(true);
+  workload.Setup();
+  workload.Start();
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_GE(workload.rows_inserted(), 100);
+  });
+  workload.StopAndJoin();
+}
+
 // Test that, after followers are evicted from the config, the master re-adds a new
 // replica for that follower and it eventually catches back up.
 TEST_F(RaftConsensusITest, TestMasterReplacesEvictedFollowers) {
-  vector<string> ts_flags = {
-    // We write 128KB cells in this test, so bump the limit.
-    "--max_cell_size_bytes=1000000"
-  };
+  vector<string> ts_flags;
   AddFlagsForLogRolls(&ts_flags); // For CauseFollowerToFallBehindLogGC().
   NO_FATALS(BuildAndStart(ts_flags));
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/9d71f99a/src/kudu/tablet/tablet_metadata.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_metadata.cc b/src/kudu/tablet/tablet_metadata.cc
index 4ed5352..d5a271f 100644
--- a/src/kudu/tablet/tablet_metadata.cc
+++ b/src/kudu/tablet/tablet_metadata.cc
@@ -203,7 +203,9 @@ Status TabletMetadata::DeleteTabletData(TabletDataState delete_type,
     }
     rowsets_.clear();
     tablet_data_state_ = delete_type;
-    tombstone_last_logged_opid_ = last_logged_opid;
+    if (last_logged_opid) {
+      tombstone_last_logged_opid_ = last_logged_opid;
+    }
   }
 
   // Keep a copy of the old data dir group in case of flush failure.

http://git-wip-us.apache.org/repos/asf/kudu/blob/9d71f99a/src/kudu/tserver/tablet_copy_client.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_copy_client.cc b/src/kudu/tserver/tablet_copy_client.cc
index 2b894fe..6d1731e 100644
--- a/src/kudu/tserver/tablet_copy_client.cc
+++ b/src/kudu/tserver/tablet_copy_client.cc
@@ -53,6 +53,7 @@
 #include "kudu/util/crc.h"
 #include "kudu/util/env.h"
 #include "kudu/util/env_util.h"
+#include "kudu/util/fault_injection.h"
 #include "kudu/util/flag_tags.h"
 #include "kudu/util/logging.h"
 #include "kudu/util/monotime.h"
@@ -78,6 +79,12 @@ DEFINE_int32(tablet_copy_dowload_file_inject_latency_ms, 0,
              "to take much longer. For use in tests only.");
 TAG_FLAG(tablet_copy_dowload_file_inject_latency_ms, hidden);
 
+DEFINE_double(tablet_copy_fault_crash_on_fetch_all, 0.0,
+              "Fraction of the time that the server will crash when FetchAll() "
+              "is called on the TabletCopyClient. (For testing only!)");
+TAG_FLAG(tablet_copy_fault_crash_on_fetch_all, unsafe);
+TAG_FLAG(tablet_copy_fault_crash_on_fetch_all, runtime);
+
 DECLARE_int32(tablet_copy_transfer_chunk_size_bytes);
 
 // RETURN_NOT_OK_PREPEND() with a remote-error unwinding step.
@@ -310,6 +317,8 @@ Status TabletCopyClient::Start(const HostPort& copy_source_addr,
 Status TabletCopyClient::FetchAll(const scoped_refptr<TabletReplica>& tablet_replica) {
   CHECK_EQ(kStarted, state_);
 
+  MAYBE_FAULT(FLAGS_tablet_copy_fault_crash_on_fetch_all);
+
   tablet_replica_ = tablet_replica;
 
   // Download all the files (serially, for now, but in parallel in the future).

http://git-wip-us.apache.org/repos/asf/kudu/blob/9d71f99a/src/kudu/tserver/ts_tablet_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index da40445..276c1f8 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -1091,7 +1091,7 @@ Status TSTabletManager::DeleteTabletData(
     const scoped_refptr<TabletMetadata>& meta,
     const scoped_refptr<consensus::ConsensusMetadataManager>& cmeta_manager,
     TabletDataState delete_type,
-    const boost::optional<OpId>& last_logged_opid) {
+    boost::optional<OpId> last_logged_opid) {
   const string& tablet_id = meta->tablet_id();
   LOG(INFO) << LogPrefix(tablet_id, meta->fs_manager())
             << "Deleting tablet data with delete state "
@@ -1105,6 +1105,7 @@ Status TSTabletManager::DeleteTabletData(
   // Note: Passing an unset 'last_logged_opid' will retain the last_logged_opid
   // that was previously in the metadata.
   RETURN_NOT_OK(meta->DeleteTabletData(delete_type, last_logged_opid));
+  last_logged_opid = meta->tombstone_last_logged_opid();
   LOG(INFO) << LogPrefix(tablet_id, meta->fs_manager())
             << "tablet deleted: last-logged OpId: "
             << (last_logged_opid ? OpIdToString(*last_logged_opid) : "(unknown)");

http://git-wip-us.apache.org/repos/asf/kudu/blob/9d71f99a/src/kudu/tserver/ts_tablet_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.h b/src/kudu/tserver/ts_tablet_manager.h
index 7257d84..86ccd06 100644
--- a/src/kudu/tserver/ts_tablet_manager.h
+++ b/src/kudu/tserver/ts_tablet_manager.h
@@ -188,7 +188,7 @@ class TSTabletManager : public tserver::TabletReplicaLookupIf {
       const scoped_refptr<tablet::TabletMetadata>& meta,
       const scoped_refptr<consensus::ConsensusMetadataManager>& cmeta_manager,
       tablet::TabletDataState delete_type,
-      const boost::optional<consensus::OpId>& last_logged_opid);
+      boost::optional<consensus::OpId> last_logged_opid);
 
   // Forces shutdown of the tablet replicas in the data dir corresponding to 'uuid'.
   void FailTabletsInDataDir(const std::string& uuid);


[3/3] kudu git commit: tablet copy: Allow voting from failed initial tablet copies

Posted by da...@apache.org.
tablet copy: Allow voting from failed initial tablet copies

This patch changes the behavior of tablets being copied for the first
time, i.e. to a new tablet server that has never hosted that replica. In
that case, if the tablet copy fails, with this patch that replica will
still be allowed to vote while tombstoned. This improves availability.
Without this patch, if a tablet copy to a new replica fails, that
replica will not be able to vote while tombstoned because its
last-logged OpId field in its superblock will be unset.
This is achieved within tablet copy by setting the initial
tombstoned_last_logged_opid to (1,0).

This patch is a bit of a hack, since it takes advantage of the fact that
it is impossible to have a "real" OpId of 1.0 in the current
implementation (1.1 is the operational minimum) so having a last-logged
OpId of 1.0 represents having never logged any actual ops.

Change-Id: Ibaad2f56797db731a0f701bbd37e3e450bc17b51
Reviewed-on: http://gerrit.cloudera.org:8080/7904
Reviewed-by: Alexey Serbin <as...@cloudera.com>
Tested-by: Kudu Jenkins
Reviewed-on: http://gerrit.cloudera.org:8080/7913
Reviewed-by: Dan Burkert <da...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/2de07e59
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/2de07e59
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/2de07e59

Branch: refs/heads/branch-1.5.x
Commit: 2de07e5940377e39947e11daa2442c94b4f8dbbe
Parents: 9d71f99
Author: Mike Percy <mp...@apache.org>
Authored: Tue Aug 29 19:23:34 2017 -0700
Committer: Dan Burkert <da...@apache.org>
Committed: Thu Aug 31 04:01:50 2017 +0000

----------------------------------------------------------------------
 src/kudu/integration-tests/tablet_copy-itest.cc | 78 ++++++++++++++++++++
 src/kudu/tserver/tablet_copy_client.cc          |  7 ++
 2 files changed, 85 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/2de07e59/src/kudu/integration-tests/tablet_copy-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/tablet_copy-itest.cc b/src/kudu/integration-tests/tablet_copy-itest.cc
index 52df993..de492ae 100644
--- a/src/kudu/integration-tests/tablet_copy-itest.cc
+++ b/src/kudu/integration-tests/tablet_copy-itest.cc
@@ -40,6 +40,7 @@
 #include "kudu/common/wire_protocol-test-util.h"
 #include "kudu/common/wire_protocol.h"
 #include "kudu/common/wire_protocol.pb.h"
+#include "kudu/consensus/consensus.pb.h"
 #include "kudu/consensus/consensus_meta_manager.h"
 #include "kudu/consensus/metadata.pb.h"
 #include "kudu/consensus/opid.pb.h"
@@ -55,6 +56,7 @@
 #include "kudu/integration-tests/external_mini_cluster.h"
 #include "kudu/integration-tests/external_mini_cluster_fs_inspector.h"
 #include "kudu/integration-tests/test_workload.h"
+#include "kudu/master/master.pb.h"
 #include "kudu/tablet/metadata.pb.h"
 #include "kudu/tablet/tablet.pb.h"
 #include "kudu/tablet/tablet_metadata.h"
@@ -87,9 +89,13 @@ DEFINE_int32(test_delete_leader_num_writer_threads, 1,
 using kudu::client::KuduSchema;
 using kudu::client::KuduSchemaFromSchema;
 using kudu::client::KuduTableCreator;
+using kudu::consensus::COMMITTED_OPID;
 using kudu::consensus::ConsensusMetadataManager;
+using kudu::consensus::ConsensusMetadataPB;
+using kudu::consensus::RaftPeerPB;
 using kudu::itest::TServerDetails;
 using kudu::itest::WaitForNumTabletServers;
+using kudu::tablet::TABLET_DATA_COPYING;
 using kudu::tablet::TABLET_DATA_DELETED;
 using kudu::tablet::TABLET_DATA_TOMBSTONED;
 using kudu::tserver::ListTabletsResponsePB;
@@ -1084,6 +1090,78 @@ TEST_F(TabletCopyITest, TestTabletCopyThrottling) {
   LOG(INFO) << "Number of in progress responses: " << num_inprogress;
 }
 
+// Test that a failed tablet copy of a brand-new replica results in still being
+// able to vote while tombstoned.
+TEST_F(TabletCopyITest, TestTabletCopyNewReplicaFailureCanVote) {
+  const MonoDelta kTimeout = MonoDelta::FromSeconds(30);
+  // We want all tablet copy attempts to fail after initial setup.
+  NO_FATALS(StartCluster({"--tablet_copy_early_session_timeout_prob=1.0"}, {},
+                         /*num_tablet_servers=*/ 4));
+  TestWorkload workload(cluster_.get());
+  workload.Setup();
+
+  ASSERT_OK(inspect_->WaitForReplicaCount(3));
+  master::GetTableLocationsResponsePB table_locations;
+  ASSERT_OK(itest::GetTableLocations(cluster_->master_proxy(), TestWorkload::kDefaultTableName,
+                                     kTimeout, &table_locations));
+  ASSERT_EQ(1, table_locations.tablet_locations_size());
+  string tablet_id = table_locations.tablet_locations(0).tablet_id();
+  set<string> replica_uuids;
+  for (const auto& replica : table_locations.tablet_locations(0).replicas()) {
+    replica_uuids.insert(replica.ts_info().permanent_uuid());
+  }
+
+  TServerDetails* leader_ts;
+  ASSERT_OK(itest::FindTabletLeader(ts_map_, tablet_id, kTimeout, &leader_ts));
+  ASSERT_OK(itest::WaitForOpFromCurrentTerm(leader_ts, tablet_id, COMMITTED_OPID, kTimeout));
+
+  string new_replica_uuid;
+  for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
+    if (!ContainsKey(replica_uuids, cluster_->tablet_server(i)->uuid())) {
+      new_replica_uuid = cluster_->tablet_server(i)->uuid();
+      break;
+    }
+  }
+  ASSERT_FALSE(new_replica_uuid.empty());
+  auto* new_replica_ts = ts_map_[new_replica_uuid];
+
+  // Adding a server will trigger a tablet copy.
+  ASSERT_OK(itest::AddServer(leader_ts, tablet_id, new_replica_ts, RaftPeerPB::VOTER,
+                             boost::none, kTimeout));
+
+  // Wait until the new replica has done its initial download, and has either
+  // failed or is about to fail (the check is nondeterministic) plus has
+  // persisted its cmeta.
+  int new_replica_idx = cluster_->tablet_server_index_by_uuid(new_replica_uuid);
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(inspect_->CheckTabletDataStateOnTS(new_replica_idx, tablet_id,
+                                                 { TABLET_DATA_COPYING, TABLET_DATA_TOMBSTONED }));
+    ConsensusMetadataPB cmeta_pb;
+    ASSERT_OK(inspect_->ReadConsensusMetadataOnTS(new_replica_idx, tablet_id, &cmeta_pb));
+  });
+
+  // Restart the cluster with 3 voters instead of 4, including the new replica
+  // that should be tombstoned. It should be able to vote and get the leader up
+  // and running.
+  cluster_->Shutdown();
+  for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
+    // Remove the "early timeout" flag.
+    cluster_->tablet_server(i)->mutable_flags()->pop_back();
+  }
+  ASSERT_OK(cluster_->master()->Restart());
+  ASSERT_OK(cluster_->tablet_server_by_uuid(new_replica_uuid)->Restart());
+  auto iter = replica_uuids.cbegin();
+  ASSERT_OK(cluster_->tablet_server_by_uuid(*iter)->Restart());
+  ASSERT_OK(cluster_->tablet_server_by_uuid(*++iter)->Restart());
+
+  // Now wait until we can write.
+  workload.Start();
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_GE(workload.rows_inserted(), 100);
+  });
+  workload.StopAndJoin();
+}
+
 // This test uses CountBlocksUnderManagement() which only works with the
 // LogBlockManager.
 #ifndef __APPLE__

http://git-wip-us.apache.org/repos/asf/kudu/blob/2de07e59/src/kudu/tserver/tablet_copy_client.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_copy_client.cc b/src/kudu/tserver/tablet_copy_client.cc
index 6d1731e..b10336d 100644
--- a/src/kudu/tserver/tablet_copy_client.cc
+++ b/src/kudu/tserver/tablet_copy_client.cc
@@ -34,6 +34,7 @@
 #include "kudu/consensus/consensus_meta_manager.h"
 #include "kudu/consensus/metadata.pb.h"
 #include "kudu/consensus/opid.pb.h"
+#include "kudu/consensus/opid_util.h"
 #include "kudu/fs/block_id.h"
 #include "kudu/fs/block_manager.h"
 #include "kudu/fs/data_dirs.h"
@@ -96,6 +97,7 @@ namespace tserver {
 
 using consensus::ConsensusMetadata;
 using consensus::ConsensusMetadataManager;
+using consensus::MakeOpId;
 using consensus::OpId;
 using env_util::CopyFile;
 using fs::BlockTransaction;
@@ -284,6 +286,11 @@ Status TabletCopyClient::Start(const HostPort& copy_source_addr,
         "Could not replace superblock with COPYING data state");
     CHECK_OK(fs_manager_->dd_manager()->CreateDataDirGroup(tablet_id_));
   } else {
+    // HACK: Set the initial tombstoned last-logged OpId to 1.0 when copying a
+    // replica for the first time, so that if the tablet copy fails, the
+    // tombstoned replica will still be able to vote.
+    // TODO(mpercy): Give this particular OpId a name.
+    *superblock_->mutable_tombstone_last_logged_opid() = MakeOpId(1, 0);
     Partition partition;
     Partition::FromPB(superblock_->partition(), &partition);
     PartitionSchema partition_schema;