You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by jd...@apache.org on 2016/02/06 19:42:09 UTC

[2/3] incubator-kudu git commit: KUDU-1278 - Mark peer responsive when remote bootstrap is in progress

KUDU-1278 - Mark peer responsive when remote bootstrap is in progress

This commit fixes a bug in remote bootstrap where, when a peer starts to remote bootstrap
its last_successful_communication_time is not updated, and so if the remote bootstrap lasts
more than peer timeout time, this peer will be evicted and leader will try to remote bootstrap
on a new peer, which will likely timeout again.
Also during remote bootstrap lots of logs are printed, so some log levels are changed from
INFO to VLOG1.

Change-Id: Ic41f2ad31c437f6ab976987262d8b35f9aa9f3ad
Reviewed-on: http://gerrit.cloudera.org:8080/1759
Reviewed-by: Todd Lipcon <to...@apache.org>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kudu/commit/b8415129
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kudu/tree/b8415129
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kudu/diff/b8415129

Branch: refs/heads/master
Commit: b8415129204d26508b5e45b57dce020280bdbc69
Parents: 4a9e2ca
Author: Binglin Chang <de...@gmail.com>
Authored: Sat Jan 9 22:42:37 2016 +0800
Committer: David Ribeiro Alves <da...@cloudera.com>
Committed: Fri Feb 5 23:32:54 2016 +0000

----------------------------------------------------------------------
 src/kudu/consensus/consensus_peers.cc                | 13 +++++++++----
 src/kudu/consensus/consensus_queue.cc                |  4 ++--
 src/kudu/integration-tests/remote_bootstrap-itest.cc |  2 +-
 src/kudu/master/catalog_manager.cc                   |  4 +++-
 src/kudu/master/catalog_manager.h                    |  4 +++-
 src/kudu/tserver/remote_bootstrap_session.cc         |  5 ++++-
 src/kudu/tserver/tablet_peer_lookup.h                |  5 ++++-
 src/kudu/tserver/tablet_service.cc                   |  5 +++--
 src/kudu/tserver/ts_tablet_manager.cc                | 12 +++++++++---
 src/kudu/tserver/ts_tablet_manager.h                 |  4 +++-
 src/kudu/tserver/tserver.proto                       |  3 +++
 11 files changed, 44 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/b8415129/src/kudu/consensus/consensus_peers.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_peers.cc b/src/kudu/consensus/consensus_peers.cc
index c27ae83..37e8cb8 100644
--- a/src/kudu/consensus/consensus_peers.cc
+++ b/src/kudu/consensus/consensus_peers.cc
@@ -301,10 +301,15 @@ Status Peer::SendRemoteBootstrapRequest() {
 }
 
 void Peer::ProcessRemoteBootstrapResponse() {
-  // We treat remote bootstrap as fire-and-forget.
-  if (rb_response_.has_error()) {
-    LOG_WITH_PREFIX_UNLOCKED(WARNING) << "Unable to begin remote bootstrap on peer: "
-                                      << rb_response_.ShortDebugString();
+  if (controller_.status().ok() && rb_response_.has_error()) {
+    // ALREADY_INPROGRESS is expected, so we do not log this error.
+    if (rb_response_.error().code() ==
+        kudu::tserver::TabletServerErrorPB::TabletServerErrorPB::ALREADY_INPROGRESS) {
+      queue_->NotifyPeerIsResponsiveDespiteError(peer_pb_.permanent_uuid());
+    } else {
+      LOG_WITH_PREFIX_UNLOCKED(WARNING) << "Unable to begin remote bootstrap on peer: "
+                                        << rb_response_.ShortDebugString();
+    }
   }
   sem_.Release();
 }

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/b8415129/src/kudu/consensus/consensus_queue.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_queue.cc b/src/kudu/consensus/consensus_queue.cc
index a36d7fe..040a5c2 100644
--- a/src/kudu/consensus/consensus_queue.cc
+++ b/src/kudu/consensus/consensus_queue.cc
@@ -313,7 +313,7 @@ Status PeerMessageQueue::RequestForPeer(const string& uuid,
   }
 
   if (PREDICT_FALSE(peer->needs_remote_bootstrap)) {
-    LOG_WITH_PREFIX_UNLOCKED(INFO) << "Peer needs remote bootstrap: " << peer->ToString();
+    VLOG_WITH_PREFIX_UNLOCKED(1) << "Peer needs remote bootstrap: " << peer->ToString();
     *needs_remote_bootstrap = true;
     return Status::OK();
   }
@@ -499,7 +499,7 @@ void PeerMessageQueue::ResponseFromPeer(const std::string& peer_uuid,
           << response.ShortDebugString();
 
       peer->needs_remote_bootstrap = true;
-      LOG_WITH_PREFIX_UNLOCKED(INFO) << "Marked peer as needing remote bootstrap: "
+      VLOG_WITH_PREFIX_UNLOCKED(1) << "Marked peer as needing remote bootstrap: "
                                      << peer->ToString();
       *more_pending = true;
       return;

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/b8415129/src/kudu/integration-tests/remote_bootstrap-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/remote_bootstrap-itest.cc b/src/kudu/integration-tests/remote_bootstrap-itest.cc
index 97df129..305d2c8 100644
--- a/src/kudu/integration-tests/remote_bootstrap-itest.cc
+++ b/src/kudu/integration-tests/remote_bootstrap-itest.cc
@@ -685,7 +685,7 @@ TEST_F(RemoteBootstrapITest, TestDisableRemoteBootstrap_NoTightLoopWhenTabletDel
 
 // Test that if a remote bootstrap is taking a long time but the client peer is still responsive,
 // the leader won't mark it as failed.
-TEST_F(RemoteBootstrapITest, DISABLED_TestSlowBootstrapDoesntFail) {
+TEST_F(RemoteBootstrapITest, TestSlowBootstrapDoesntFail) {
   MonoDelta timeout = MonoDelta::FromSeconds(10);
   vector<string> ts_flags, master_flags;
   ts_flags.push_back("--enable_leader_failure_detection=false");

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/b8415129/src/kudu/master/catalog_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/master/catalog_manager.cc b/src/kudu/master/catalog_manager.cc
index 9a9344a..000a6a5 100644
--- a/src/kudu/master/catalog_manager.cc
+++ b/src/kudu/master/catalog_manager.cc
@@ -1701,7 +1701,9 @@ const NodeInstancePB& CatalogManager::NodeInstance() const {
   return master_->instance_pb();
 }
 
-Status CatalogManager::StartRemoteBootstrap(const StartRemoteBootstrapRequestPB& req) {
+Status CatalogManager::StartRemoteBootstrap(
+    const StartRemoteBootstrapRequestPB& req,
+    boost::optional<kudu::tserver::TabletServerErrorPB::Code>* error_code) {
   return Status::NotSupported("Remote bootstrap not yet implemented for the master tablet");
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/b8415129/src/kudu/master/catalog_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/master/catalog_manager.h b/src/kudu/master/catalog_manager.h
index 2365361..b06cb5a 100644
--- a/src/kudu/master/catalog_manager.h
+++ b/src/kudu/master/catalog_manager.h
@@ -401,7 +401,9 @@ class CatalogManager : public tserver::TabletPeerLookupIf {
 
   bool IsInitialized() const;
 
-  virtual Status StartRemoteBootstrap(const consensus::StartRemoteBootstrapRequestPB& req) OVERRIDE;
+  virtual Status StartRemoteBootstrap(
+      const consensus::StartRemoteBootstrapRequestPB& req,
+      boost::optional<kudu::tserver::TabletServerErrorPB::Code>* error_code) OVERRIDE;
 
   // Return OK if this CatalogManager is a leader in a consensus configuration and if
   // the required leader state (metadata for tables and tablets) has

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/b8415129/src/kudu/tserver/remote_bootstrap_session.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/remote_bootstrap_session.cc b/src/kudu/tserver/remote_bootstrap_session.cc
index c0c6aa1..2d2aced 100644
--- a/src/kudu/tserver/remote_bootstrap_session.cc
+++ b/src/kudu/tserver/remote_bootstrap_session.cc
@@ -90,8 +90,11 @@ Status RemoteBootstrapSession::Init() {
   // All subsequent requests should reuse the opened blocks.
   vector<BlockIdPB> data_blocks;
   TabletMetadata::CollectBlockIdPBs(tablet_superblock_, &data_blocks);
+  LOG(INFO) << "T " << tablet_peer_->tablet_id()
+            << " P " << tablet_peer_->consensus()->peer_uuid()
+            << ": Remote bootstrap: Opening " << data_blocks.size() << " blocks";
   for (const BlockIdPB& block_id : data_blocks) {
-    LOG(INFO) << "Opening block " << block_id.DebugString();
+    VLOG(1) << "Opening block " << block_id.DebugString();
     RETURN_NOT_OK(OpenBlockUnlocked(BlockId::FromPB(block_id)));
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/b8415129/src/kudu/tserver/tablet_peer_lookup.h
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_peer_lookup.h b/src/kudu/tserver/tablet_peer_lookup.h
index ceb087c..c58b79d 100644
--- a/src/kudu/tserver/tablet_peer_lookup.h
+++ b/src/kudu/tserver/tablet_peer_lookup.h
@@ -17,10 +17,12 @@
 #ifndef KUDU_TSERVER_TABLET_PEER_LOOKUP_H_
 #define KUDU_TSERVER_TABLET_PEER_LOOKUP_H_
 
+#include <boost/optional/optional_fwd.hpp>
 #include <memory>
 #include <string>
 
 #include "kudu/gutil/ref_counted.h"
+#include "kudu/tserver/tserver.pb.h"
 #include "kudu/util/status.h"
 
 namespace kudu {
@@ -49,7 +51,8 @@ class TabletPeerLookupIf {
 
   virtual const NodeInstancePB& NodeInstance() const = 0;
 
-  virtual Status StartRemoteBootstrap(const consensus::StartRemoteBootstrapRequestPB& req) = 0;
+  virtual Status StartRemoteBootstrap(const consensus::StartRemoteBootstrapRequestPB& req,
+                                      boost::optional<TabletServerErrorPB::Code>* error_code) = 0;
 };
 
 } // namespace tserver

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/b8415129/src/kudu/tserver/tablet_service.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_service.cc b/src/kudu/tserver/tablet_service.cc
index 10d8f3b..8787c43 100644
--- a/src/kudu/tserver/tablet_service.cc
+++ b/src/kudu/tserver/tablet_service.cc
@@ -956,10 +956,11 @@ void ConsensusServiceImpl::StartRemoteBootstrap(const StartRemoteBootstrapReques
   if (!CheckUuidMatchOrRespond(tablet_manager_, "StartRemoteBootstrap", req, resp, context)) {
     return;
   }
-  Status s = tablet_manager_->StartRemoteBootstrap(*req);
+  boost::optional<TabletServerErrorPB::Code> error_code;
+  Status s = tablet_manager_->StartRemoteBootstrap(*req, &error_code);
   if (!s.ok()) {
     SetupErrorAndRespond(resp->mutable_error(), s,
-                         TabletServerErrorPB::UNKNOWN_ERROR,
+                         error_code.get_value_or(TabletServerErrorPB::UNKNOWN_ERROR),
                          context);
     return;
   }

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/b8415129/src/kudu/tserver/ts_tablet_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index 2e89e89..a26699e 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -337,7 +337,9 @@ Status TSTabletManager::CheckLeaderTermNotLower(const string& tablet_id,
   return Status::OK();
 }
 
-Status TSTabletManager::StartRemoteBootstrap(const StartRemoteBootstrapRequestPB& req) {
+Status TSTabletManager::StartRemoteBootstrap(
+    const StartRemoteBootstrapRequestPB& req,
+    boost::optional<TabletServerErrorPB::Code>* error_code) {
   const string& tablet_id = req.tablet_id();
   const string& bootstrap_peer_uuid = req.bootstrap_peer_uuid();
   HostPort bootstrap_peer_addr;
@@ -356,8 +358,12 @@ Status TSTabletManager::StartRemoteBootstrap(const StartRemoteBootstrapRequestPB
       meta = old_tablet_peer->tablet_metadata();
       replacing_tablet = true;
     }
-    RETURN_NOT_OK(StartTabletStateTransitionUnlocked(tablet_id, "remote bootstrapping tablet",
-                                                     &deleter));
+    Status ret = StartTabletStateTransitionUnlocked(tablet_id, "remote bootstrapping tablet",
+                                                    &deleter);
+    if (!ret.ok()) {
+      *error_code = TabletServerErrorPB::ALREADY_INPROGRESS;
+      return ret;
+    }
   }
 
   if (replacing_tablet) {

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/b8415129/src/kudu/tserver/ts_tablet_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.h b/src/kudu/tserver/ts_tablet_manager.h
index 2d63142..7fdb335 100644
--- a/src/kudu/tserver/ts_tablet_manager.h
+++ b/src/kudu/tserver/ts_tablet_manager.h
@@ -145,7 +145,9 @@ class TSTabletManager : public tserver::TabletPeerLookupIf {
   // See the StartRemoteBootstrap() RPC declaration in consensus.proto for details.
   // Currently this runs the entire procedure synchronously.
   // TODO: KUDU-921: Run this procedure on a background thread.
-  virtual Status StartRemoteBootstrap(const consensus::StartRemoteBootstrapRequestPB& req) OVERRIDE;
+  virtual Status StartRemoteBootstrap(
+      const consensus::StartRemoteBootstrapRequestPB& req,
+      boost::optional<TabletServerErrorPB::Code>* error_code) OVERRIDE;
 
   // Generate an incremental tablet report.
   //

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/b8415129/src/kudu/tserver/tserver.proto
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tserver.proto b/src/kudu/tserver/tserver.proto
index 1762b22..5d7933e 100644
--- a/src/kudu/tserver/tserver.proto
+++ b/src/kudu/tserver/tserver.proto
@@ -85,6 +85,9 @@ message TabletServerErrorPB {
 
     // The compare-and-swap specified by an atomic RPC operation failed.
     CAS_FAILED = 17;
+
+    // The requested operation is already inprogress, e.g. RemoteBootstrap.
+    ALREADY_INPROGRESS = 18;
   }
 
   // The error code.