You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2018/01/17 03:19:20 UTC
[2/2] kudu git commit: [catalog_manager] more info if unable to
replace a replica
[catalog_manager] more info if unable to replace a replica
Output an actionable warning message when the catalog manager is unable
to find a spot for a replacement replica. Since the 3-4-3 replication
scheme is now enabled by default, this might be useful when running
a cluster with just 3 tablet servers while tables have a replication
factor of 3.
Change-Id: Id5f562c6d1ff526daa785ea535e440598c03cd37
Reviewed-on: http://gerrit.cloudera.org:8080/9040
Reviewed-by: Mike Percy <mp...@apache.org>
Tested-by: Kudu Jenkins
Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/c40e0587
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/c40e0587
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/c40e0587
Branch: refs/heads/master
Commit: c40e0587bf6a6aa55e5bd72dd2dd9356b1507f2e
Parents: 5806b80
Author: Alexey Serbin <as...@cloudera.com>
Authored: Tue Jan 16 15:01:34 2018 -0800
Committer: Alexey Serbin <as...@cloudera.com>
Committed: Wed Jan 17 03:18:11 2018 +0000
----------------------------------------------------------------------
src/kudu/master/catalog_manager.cc | 38 ++++++++++++++++++++++++++++++---
src/kudu/master/ts_manager.cc | 4 ++--
src/kudu/master/ts_manager.h | 8 +++----
3 files changed, 41 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kudu/blob/c40e0587/src/kudu/master/catalog_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/master/catalog_manager.cc b/src/kudu/master/catalog_manager.cc
index d6b369b..1248144 100644
--- a/src/kudu/master/catalog_manager.cc
+++ b/src/kudu/master/catalog_manager.cc
@@ -3133,11 +3133,43 @@ bool AsyncAddReplicaTask::SendRequest(int attempt) {
InsertOrDie(&excluded, ts_desc);
}
}
+
auto replacement_replica = SelectReplica(ts_descs, excluded, rng_);
if (PREDICT_FALSE(!replacement_replica)) {
- KLOG_EVERY_N(WARNING, 100) << LogPrefix()
- << "No candidate replacement replica found for tablet "
- << tablet_->ToString();
+ auto msg = Substitute("no candidate replacement replica found for tablet $0",
+ tablet_->ToString());
+ // Check whether it's a situation when a replacement replica cannot be found
+ // due to an inconsistency in cluster configuration. If the tablet has the
+ // replication factor of N, and the cluster is configured to use N->(N+1)->N
+ // replication scheme (see --raft_prepare_replacement_before_eviction flag),
+ // at least N+1 tablet servers should be registered to find a place
+ // for a replacement replica.
+ TSDescriptorVector all_descriptors;
+ master_->ts_manager()->GetAllDescriptors(&all_descriptors);
+ const auto num_tservers_registered = all_descriptors.size();
+
+ auto replication_factor = 0;
+ {
+ TableMetadataLock l(tablet_->table().get(), LockMode::READ);
+ replication_factor = tablet_->table()->metadata().state().pb.num_replicas();
+ }
+ DCHECK_GE(replication_factor, 0);
+ const auto num_tservers_needed =
+ FLAGS_raft_prepare_replacement_before_eviction ? replication_factor + 1
+ : replication_factor;
+ if (num_tservers_registered < num_tservers_needed) {
+ msg += Substitute(
+ "; the total number of registered tablet servers ($0) does not allow "
+ "for replacement of the failed replica: at least $1 tablet servers "
+ "are required", num_tservers_registered, num_tservers_needed);
+ if (FLAGS_raft_prepare_replacement_before_eviction &&
+ num_tservers_registered == replication_factor) {
+ msg +=
+ "; consider either adding an additional tablet server or running "
+ "the cluster with --raft_prepare_replacement_before_eviction=false";
+ }
+ }
+ KLOG_EVERY_N_SECS(WARNING, 60) << LogPrefix() << msg;
return false;
}
http://git-wip-us.apache.org/repos/asf/kudu/blob/c40e0587/src/kudu/master/ts_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/master/ts_manager.cc b/src/kudu/master/ts_manager.cc
index 65faad8..8b1aa1c 100644
--- a/src/kudu/master/ts_manager.cc
+++ b/src/kudu/master/ts_manager.cc
@@ -92,13 +92,13 @@ Status TSManager::RegisterTS(const NodeInstancePB& instance,
return Status::OK();
}
-void TSManager::GetAllDescriptors(vector<shared_ptr<TSDescriptor> > *descs) const {
+void TSManager::GetAllDescriptors(vector<shared_ptr<TSDescriptor>> *descs) const {
descs->clear();
shared_lock<rw_spinlock> l(lock_);
AppendValuesFromMap(servers_by_id_, descs);
}
-void TSManager::GetAllLiveDescriptors(vector<shared_ptr<TSDescriptor> > *descs) const {
+void TSManager::GetAllLiveDescriptors(vector<shared_ptr<TSDescriptor>> *descs) const {
descs->clear();
shared_lock<rw_spinlock> l(lock_);
http://git-wip-us.apache.org/repos/asf/kudu/blob/c40e0587/src/kudu/master/ts_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/master/ts_manager.h b/src/kudu/master/ts_manager.h
index ecdd5cd..8327203 100644
--- a/src/kudu/master/ts_manager.h
+++ b/src/kudu/master/ts_manager.h
@@ -35,7 +35,7 @@ namespace master {
class TSDescriptor;
-typedef std::vector<std::shared_ptr<TSDescriptor> > TSDescriptorVector;
+typedef std::vector<std::shared_ptr<TSDescriptor>> TSDescriptorVector;
// Tracks the servers that the master has heard from, along with their
// last heartbeat, etc.
@@ -74,11 +74,11 @@ class TSManager {
// Return all of the currently registered TS descriptors into the provided
// list.
- void GetAllDescriptors(std::vector<std::shared_ptr<TSDescriptor> >* descs) const;
+ void GetAllDescriptors(std::vector<std::shared_ptr<TSDescriptor>>* descs) const;
// Return all of the currently registered TS descriptors that have sent a
// heartbeat recently, indicating that they're alive and well.
- void GetAllLiveDescriptors(std::vector<std::shared_ptr<TSDescriptor> >* descs) const;
+ void GetAllLiveDescriptors(std::vector<std::shared_ptr<TSDescriptor>>* descs) const;
// Get the TS count.
int GetCount() const;
@@ -87,7 +87,7 @@ class TSManager {
mutable rw_spinlock lock_;
typedef std::unordered_map<
- std::string, std::shared_ptr<TSDescriptor> > TSDescriptorMap;
+ std::string, std::shared_ptr<TSDescriptor>> TSDescriptorMap;
TSDescriptorMap servers_by_id_;
DISALLOW_COPY_AND_ASSIGN(TSManager);