You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2018/01/17 03:19:20 UTC

[2/2] kudu git commit: [catalog_manager] more info if unable to replace a replica

[catalog_manager] more info if unable to replace a replica

Output an actionable warning message when the catalog manager is unable
to find a spot for a replacement replica.  Since the 3-4-3 replication
scheme is now enabled by default, this might be useful when running
a cluster with just 3 tablet servers while tables have a replication
factor of 3.

Change-Id: Id5f562c6d1ff526daa785ea535e440598c03cd37
Reviewed-on: http://gerrit.cloudera.org:8080/9040
Reviewed-by: Mike Percy <mp...@apache.org>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/c40e0587
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/c40e0587
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/c40e0587

Branch: refs/heads/master
Commit: c40e0587bf6a6aa55e5bd72dd2dd9356b1507f2e
Parents: 5806b80
Author: Alexey Serbin <as...@cloudera.com>
Authored: Tue Jan 16 15:01:34 2018 -0800
Committer: Alexey Serbin <as...@cloudera.com>
Committed: Wed Jan 17 03:18:11 2018 +0000

----------------------------------------------------------------------
 src/kudu/master/catalog_manager.cc | 38 ++++++++++++++++++++++++++++++---
 src/kudu/master/ts_manager.cc      |  4 ++--
 src/kudu/master/ts_manager.h       |  8 +++----
 3 files changed, 41 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/c40e0587/src/kudu/master/catalog_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/master/catalog_manager.cc b/src/kudu/master/catalog_manager.cc
index d6b369b..1248144 100644
--- a/src/kudu/master/catalog_manager.cc
+++ b/src/kudu/master/catalog_manager.cc
@@ -3133,11 +3133,43 @@ bool AsyncAddReplicaTask::SendRequest(int attempt) {
       InsertOrDie(&excluded, ts_desc);
     }
   }
+
   auto replacement_replica = SelectReplica(ts_descs, excluded, rng_);
   if (PREDICT_FALSE(!replacement_replica)) {
-    KLOG_EVERY_N(WARNING, 100) << LogPrefix()
-                               << "No candidate replacement replica found for tablet "
-                               << tablet_->ToString();
+    auto msg = Substitute("no candidate replacement replica found for tablet $0",
+                          tablet_->ToString());
+    // Check whether it's a situation when a replacement replica cannot be found
+    // due to an inconsistency in cluster configuration. If the tablet has the
+    // replication factor of N, and the cluster is configured to use N->(N+1)->N
+    // replication scheme (see --raft_prepare_replacement_before_eviction flag),
+    // at least N+1 tablet servers should be registered to find a place
+    // for a replacement replica.
+    TSDescriptorVector all_descriptors;
+    master_->ts_manager()->GetAllDescriptors(&all_descriptors);
+    const auto num_tservers_registered = all_descriptors.size();
+
+    auto replication_factor = 0;
+    {
+      TableMetadataLock l(tablet_->table().get(), LockMode::READ);
+      replication_factor = tablet_->table()->metadata().state().pb.num_replicas();
+    }
+    DCHECK_GE(replication_factor, 0);
+    const auto num_tservers_needed =
+        FLAGS_raft_prepare_replacement_before_eviction ? replication_factor + 1
+                                                       : replication_factor;
+    if (num_tservers_registered < num_tservers_needed) {
+      msg += Substitute(
+          "; the total number of registered tablet servers ($0) does not allow "
+          "for replacement of the failed replica: at least $1 tablet servers "
+          "are required", num_tservers_registered, num_tservers_needed);
+      if (FLAGS_raft_prepare_replacement_before_eviction &&
+          num_tservers_registered == replication_factor) {
+        msg +=
+          "; consider either adding an additional tablet server or running "
+          "the cluster with --raft_prepare_replacement_before_eviction=false";
+      }
+    }
+    KLOG_EVERY_N_SECS(WARNING, 60) << LogPrefix() << msg;
     return false;
   }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/c40e0587/src/kudu/master/ts_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/master/ts_manager.cc b/src/kudu/master/ts_manager.cc
index 65faad8..8b1aa1c 100644
--- a/src/kudu/master/ts_manager.cc
+++ b/src/kudu/master/ts_manager.cc
@@ -92,13 +92,13 @@ Status TSManager::RegisterTS(const NodeInstancePB& instance,
   return Status::OK();
 }
 
-void TSManager::GetAllDescriptors(vector<shared_ptr<TSDescriptor> > *descs) const {
+void TSManager::GetAllDescriptors(vector<shared_ptr<TSDescriptor>> *descs) const {
   descs->clear();
   shared_lock<rw_spinlock> l(lock_);
   AppendValuesFromMap(servers_by_id_, descs);
 }
 
-void TSManager::GetAllLiveDescriptors(vector<shared_ptr<TSDescriptor> > *descs) const {
+void TSManager::GetAllLiveDescriptors(vector<shared_ptr<TSDescriptor>> *descs) const {
   descs->clear();
 
   shared_lock<rw_spinlock> l(lock_);

http://git-wip-us.apache.org/repos/asf/kudu/blob/c40e0587/src/kudu/master/ts_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/master/ts_manager.h b/src/kudu/master/ts_manager.h
index ecdd5cd..8327203 100644
--- a/src/kudu/master/ts_manager.h
+++ b/src/kudu/master/ts_manager.h
@@ -35,7 +35,7 @@ namespace master {
 
 class TSDescriptor;
 
-typedef std::vector<std::shared_ptr<TSDescriptor> > TSDescriptorVector;
+typedef std::vector<std::shared_ptr<TSDescriptor>> TSDescriptorVector;
 
 // Tracks the servers that the master has heard from, along with their
 // last heartbeat, etc.
@@ -74,11 +74,11 @@ class TSManager {
 
   // Return all of the currently registered TS descriptors into the provided
   // list.
-  void GetAllDescriptors(std::vector<std::shared_ptr<TSDescriptor> >* descs) const;
+  void GetAllDescriptors(std::vector<std::shared_ptr<TSDescriptor>>* descs) const;
 
   // Return all of the currently registered TS descriptors that have sent a
   // heartbeat recently, indicating that they're alive and well.
-  void GetAllLiveDescriptors(std::vector<std::shared_ptr<TSDescriptor> >* descs) const;
+  void GetAllLiveDescriptors(std::vector<std::shared_ptr<TSDescriptor>>* descs) const;
 
   // Get the TS count.
   int GetCount() const;
@@ -87,7 +87,7 @@ class TSManager {
   mutable rw_spinlock lock_;
 
   typedef std::unordered_map<
-    std::string, std::shared_ptr<TSDescriptor> > TSDescriptorMap;
+    std::string, std::shared_ptr<TSDescriptor>> TSDescriptorMap;
   TSDescriptorMap servers_by_id_;
 
   DISALLOW_COPY_AND_ASSIGN(TSManager);