You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by mp...@apache.org on 2017/12/02 05:28:41 UTC

[3/3] kudu git commit: KUDU-2229. consensus: Leader should not start FD

KUDU-2229. consensus: Leader should not start FD

This patch fixes a log spam issue caused by commit
28a671365d6d38da966481daf937b3776e3d4852 that erroneously enables the
failure detector on the leader when a configuration takes place. That
bug had no consensus safety implications, but it did generate a large
volume of confusing log messages in a neverending loop.

This patch also includes some minor updates to InternalMiniCluster for
convenience purposes.

Change-Id: Ie2ec9c5499e8e4c1659333bd53dd2d7f6dae81f5
Reviewed-on: http://gerrit.cloudera.org:8080/8711
Reviewed-by: Alexey Serbin <as...@cloudera.com>
Tested-by: Kudu Jenkins
Reviewed-on: http://gerrit.cloudera.org:8080/8732


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/d9eb7d4c
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/d9eb7d4c
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/d9eb7d4c

Branch: refs/heads/branch-1.6.x
Commit: d9eb7d4cb696cb460a988cda8d36e5719f4c283c
Parents: 01876d0
Author: Mike Percy <mp...@apache.org>
Authored: Thu Nov 30 19:40:23 2017 -0800
Committer: Mike Percy <mp...@apache.org>
Committed: Sat Dec 2 05:28:19 2017 +0000

----------------------------------------------------------------------
 src/kudu/consensus/raft_consensus.cc            |  21 ++-
 src/kudu/consensus/raft_consensus.h             |  19 ++-
 src/kudu/integration-tests/CMakeLists.txt       |   3 +-
 ...raft_consensus_failure_detector-imc-itest.cc | 144 +++++++++++++++++++
 src/kudu/mini-cluster/external_mini_cluster.h   |   4 +-
 src/kudu/mini-cluster/internal_mini_cluster.cc  |  18 +++
 src/kudu/mini-cluster/internal_mini_cluster.h   |   5 +
 src/kudu/mini-cluster/mini_cluster.h            |   4 +
 src/kudu/rpc/periodic.cc                        |   5 +
 src/kudu/rpc/periodic.h                         |   5 +-
 10 files changed, 206 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/d9eb7d4c/src/kudu/consensus/raft_consensus.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/raft_consensus.cc b/src/kudu/consensus/raft_consensus.cc
index 291c146..cb3118e 100644
--- a/src/kudu/consensus/raft_consensus.cc
+++ b/src/kudu/consensus/raft_consensus.cc
@@ -599,7 +599,7 @@ Status RaftConsensus::BecomeReplicaUnlocked(boost::optional<MonoDelta> fd_delta)
 
   // Enable/disable leader failure detection if becoming VOTER/NON_VOTER replica
   // correspondingly.
-  ToggleFailureDetector(std::move(fd_delta));
+  UpdateFailureDetectorState(std::move(fd_delta));
 
   // Now that we're a replica, we can allow voting for other nodes.
   withhold_votes_until_ = MonoTime::Min();
@@ -2610,7 +2610,7 @@ void RaftConsensus::CompleteConfigChangeRoundUnlocked(ConsensusRound* round, con
 
       // Disable leader failure detection if transitioning from VOTER to
       // NON_VOTER and vice versa.
-      ToggleFailureDetector();
+      UpdateFailureDetectorState();
     } else {
       LOG_WITH_PREFIX_UNLOCKED(INFO)
           << "Skipping abort of non-pending config change with OpId "
@@ -2670,14 +2670,15 @@ void RaftConsensus::DisableFailureDetector() {
   }
 }
 
-void RaftConsensus::ToggleFailureDetector(boost::optional<MonoDelta> delta) {
-  if (IsRaftConfigVoter(peer_uuid(), cmeta_->ActiveConfig())) {
-    // A non-leader voter replica should run failure detector.
+void RaftConsensus::UpdateFailureDetectorState(boost::optional<MonoDelta> delta) {
+  const auto& uuid = peer_uuid();
+  if (uuid != cmeta_->leader_uuid() &&
+      cmeta_->IsVoterInConfig(uuid, ACTIVE_CONFIG)) {
+    // A voter that is not the leader should run the failure detector.
     EnableFailureDetector(std::move(delta));
   } else {
-    // A non-voter should not start leader elections. The leader failure
-    // detector should be re-enabled once the non-voter replica is promoted
-    // to voter replica.
+    // Otherwise, the local peer should not start leader elections
+    // (e.g. if it is the leader, a non-voter, a non-participant, etc).
     DisableFailureDetector();
   }
 }
@@ -2805,9 +2806,7 @@ Status RaftConsensus::SetPendingConfigUnlocked(const RaftConfigPB& new_config) {
   }
   cmeta_->set_pending_config(new_config);
 
-  // Disable leader failure detection if transitioning from VOTER to NON_VOTER
-  // and vice versa.
-  ToggleFailureDetector();
+  UpdateFailureDetectorState();
 
   return Status::OK();
 }

http://git-wip-us.apache.org/repos/asf/kudu/blob/d9eb7d4c/src/kudu/consensus/raft_consensus.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/raft_consensus.h b/src/kudu/consensus/raft_consensus.h
index 2c438d1..5ace791 100644
--- a/src/kudu/consensus/raft_consensus.h
+++ b/src/kudu/consensus/raft_consensus.h
@@ -158,6 +158,11 @@ class RaftConsensus : public std::enable_shared_from_this<RaftConsensus>,
   // Returns Status::TimedOut if the role is not LEADER within 'timeout'.
   Status WaitUntilLeaderForTests(const MonoDelta& timeout);
 
+  // Return a copy of the failure detector instance. Only for use in tests.
+  std::shared_ptr<rpc::PeriodicTimer> GetFailureDetectorForTests() const {
+    return failure_detector_;
+  }
+
   // Implement a LeaderStepDown() request.
   Status StepDown(LeaderStepDownResponsePB* resp);
 
@@ -582,13 +587,15 @@ class RaftConsensus : public std::enable_shared_from_this<RaftConsensus>,
   // If the failure detector is already disabled, has no effect.
   void DisableFailureDetector();
 
-  // Disable leader failure detector if transitioning from VOTER to NON_VOTER,
-  // and vice versa. The decision is based on the type of membership of the
-  // peer in the active Raft configuration.
+  // Enables or disables the failure detector based on the role of the local
+  // peer in the active config. If the local peer a VOTER, but not the leader,
+  // then failure detection will be enabled. If the local peer is the leader,
+  // or a NON_VOTER, then failure detection will be disabled.
   //
-  // If it's time to enable the leader failure detection, the specified
-  // 'delta' value is used as described in EnableFailureDetector()'s comment.
-  void ToggleFailureDetector(boost::optional<MonoDelta> delta = boost::none);
+  // See EnableFailureDetector() for an explanation of the 'delta' parameter,
+  // which is used if it is determined that the failure detector should be
+  // enabled.
+  void UpdateFailureDetectorState(boost::optional<MonoDelta> delta = boost::none);
 
   // "Reset" the failure detector to indicate leader activity.
   //

http://git-wip-us.apache.org/repos/asf/kudu/blob/d9eb7d4c/src/kudu/integration-tests/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/CMakeLists.txt b/src/kudu/integration-tests/CMakeLists.txt
index df5853b..a3f2294 100644
--- a/src/kudu/integration-tests/CMakeLists.txt
+++ b/src/kudu/integration-tests/CMakeLists.txt
@@ -86,9 +86,10 @@ ADD_KUDU_TEST(master-stress-test RESOURCE_LOCK "master-rpc-ports")
 ADD_KUDU_TEST(multidir_cluster-itest)
 ADD_KUDU_TEST(open-readonly-fs-itest)
 ADD_KUDU_TEST(raft_config_change-itest)
-ADD_KUDU_TEST(raft_consensus-itest RUN_SERIAL true)
 ADD_KUDU_TEST(raft_consensus_election-itest)
+ADD_KUDU_TEST(raft_consensus_failure_detector-imc-itest)
 ADD_KUDU_TEST(raft_consensus_nonvoter-itest)
+ADD_KUDU_TEST(raft_consensus-itest RUN_SERIAL true)
 ADD_KUDU_TEST(registration-test RESOURCE_LOCK "master-web-port")
 ADD_KUDU_TEST(security-faults-itest)
 ADD_KUDU_TEST(security-itest)

http://git-wip-us.apache.org/repos/asf/kudu/blob/d9eb7d4c/src/kudu/integration-tests/raft_consensus_failure_detector-imc-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus_failure_detector-imc-itest.cc b/src/kudu/integration-tests/raft_consensus_failure_detector-imc-itest.cc
new file mode 100644
index 0000000..32acff3
--- /dev/null
+++ b/src/kudu/integration-tests/raft_consensus_failure_detector-imc-itest.cc
@@ -0,0 +1,144 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+#include <ostream>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "kudu/consensus/metadata.pb.h"
+#include "kudu/consensus/raft_consensus.h"
+#include "kudu/gutil/ref_counted.h"
+#include "kudu/integration-tests/cluster_itest_util.h"
+#include "kudu/integration-tests/internal_mini_cluster-itest-base.h"
+#include "kudu/integration-tests/test_workload.h"
+#include "kudu/mini-cluster/internal_mini_cluster.h"
+#include "kudu/rpc/periodic.h"
+#include "kudu/tablet/tablet_replica.h"
+#include "kudu/tserver/mini_tablet_server.h"
+#include "kudu/tserver/tablet_server.h"
+#include "kudu/tserver/ts_tablet_manager.h"
+#include "kudu/util/monotime.h"
+#include "kudu/util/status.h"
+#include "kudu/util/test_macros.h"
+#include "kudu/util/test_util.h"
+
+using kudu::consensus::RaftPeerPB;
+using kudu::itest::WaitForServersToAgree;
+using kudu::tablet::TabletReplica;
+using std::string;
+using std::vector;
+
+namespace kudu {
+
+class RaftConsensusFailureDetectorIMCTest : public MiniClusterITestBase {
+};
+
+// Ensure that the failure detector is enabled for non-leader voters, and
+// disabled for leaders and non-voters. Ensure this persists after a
+// configuration change.
+// Regression test for KUDU-2229.
+TEST_F(RaftConsensusFailureDetectorIMCTest, TestFailureDetectorActivation) {
+  if (!AllowSlowTests()) {
+    LOG(WARNING) << "Skipping test in fast-test mode.";
+    return;
+  }
+
+  const MonoDelta kTimeout = MonoDelta::FromSeconds(30);
+
+  const int kNumReplicas = 3;
+  NO_FATALS(StartCluster(/*num_tablet_servers=*/ kNumReplicas + 1));
+  TestWorkload workload(cluster_.get());
+  workload.Setup();
+
+  // Identify the tablet id and locate the replicas.
+  string tablet_id;
+  itest::TabletReplicaMap replica_map_;
+  string missing_replica_uuid;
+  ASSERT_EVENTUALLY([&] {
+    replica_map_.clear();
+    missing_replica_uuid.clear();
+    for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
+      auto* ts = cluster_->mini_tablet_server(i);
+      auto tablet_ids = ts->ListTablets();
+      if (tablet_ids.empty()) {
+        missing_replica_uuid = ts->uuid();
+        continue;
+      }
+      ASSERT_EQ(1, tablet_ids.size());
+      if (!tablet_id.empty()) {
+        ASSERT_EQ(tablet_id, tablet_ids[0]);
+      }
+      tablet_id = tablet_ids[0];
+      replica_map_.emplace(tablet_id, ts_map_[ts->uuid()]);
+    }
+    ASSERT_EQ(kNumReplicas, replica_map_.count(tablet_id));
+    ASSERT_FALSE(tablet_id.empty());
+  });
+
+  // Wait until tablets are running.
+  auto range = replica_map_.equal_range(tablet_id);
+  for (auto it = range.first; it != range.second; ++it) {
+    auto ts = it->second;
+    ASSERT_OK(WaitUntilTabletRunning(ts, tablet_id, kTimeout));
+  }
+  // Elect a leader.
+  string leader_uuid = range.first->second->uuid();
+  ASSERT_OK(StartElection(ts_map_[leader_uuid], tablet_id, kTimeout));
+  ASSERT_OK(WaitUntilLeader(ts_map_[leader_uuid], tablet_id, kTimeout));
+
+  auto active_ts_map = ts_map_;
+  ASSERT_EQ(1, active_ts_map.erase(missing_replica_uuid));
+
+  // Ensure all servers are up to date.
+  ASSERT_OK(WaitForServersToAgree(kTimeout, active_ts_map, tablet_id,
+                                  /*minimum_index=*/ 1));
+
+  // Ensure that the failure detector activation state is consistent with the
+  // rule that only non-leader VOTER replicas should have failure detection
+  // enabled. Leaders and NON_VOTER replicas should not enable the FD.
+  auto validate_failure_detector_status = [&](const itest::TabletServerMap& ts_map) {
+    for (const auto& entry : ts_map) {
+      const auto& uuid = entry.first;
+      auto mini_ts = cluster_->mini_tablet_server_by_uuid(uuid);
+      vector<scoped_refptr<TabletReplica>> replicas;
+      mini_ts->server()->tablet_manager()->GetTabletReplicas(&replicas);
+      ASSERT_EQ(1, replicas.size());
+      const auto& replica = replicas[0];
+      ASSERT_EQ(replica->consensus()->role() == RaftPeerPB::FOLLOWER,
+                replica->consensus()->GetFailureDetectorForTests()->started());
+    }
+  };
+
+  NO_FATALS(validate_failure_detector_status(active_ts_map));
+
+  // Add a new non-voter.
+  ASSERT_OK(AddServer(ts_map_[leader_uuid], tablet_id,
+                      ts_map_[missing_replica_uuid],
+                      RaftPeerPB::NON_VOTER, kTimeout));
+  ASSERT_OK(WaitForServersToAgree(kTimeout, ts_map_, tablet_id,
+                                  /*minimum_index=*/ 2));
+
+  NO_FATALS(validate_failure_detector_status(ts_map_));
+}
+
+} // namespace kudu

http://git-wip-us.apache.org/repos/asf/kudu/blob/d9eb7d4c/src/kudu/mini-cluster/external_mini_cluster.h
----------------------------------------------------------------------
diff --git a/src/kudu/mini-cluster/external_mini_cluster.h b/src/kudu/mini-cluster/external_mini_cluster.h
index f879949..d969527 100644
--- a/src/kudu/mini-cluster/external_mini_cluster.h
+++ b/src/kudu/mini-cluster/external_mini_cluster.h
@@ -232,9 +232,7 @@ class ExternalMiniCluster : public MiniCluster {
   // Return ExternalTabletServer given its UUID. If not found, returns NULL.
   ExternalTabletServer* tablet_server_by_uuid(const std::string& uuid) const;
 
-  // Return the index of the ExternalTabletServer that has the given 'uuid', or
-  // -1 if no such UUID can be found.
-  int tablet_server_index_by_uuid(const std::string& uuid) const;
+  int tablet_server_index_by_uuid(const std::string& uuid) const override;
 
   // Return all tablet servers and masters.
   std::vector<ExternalDaemon*> daemons() const;

http://git-wip-us.apache.org/repos/asf/kudu/blob/d9eb7d4c/src/kudu/mini-cluster/internal_mini_cluster.cc
----------------------------------------------------------------------
diff --git a/src/kudu/mini-cluster/internal_mini_cluster.cc b/src/kudu/mini-cluster/internal_mini_cluster.cc
index dbf7f6a..cfd0474 100644
--- a/src/kudu/mini-cluster/internal_mini_cluster.cc
+++ b/src/kudu/mini-cluster/internal_mini_cluster.cc
@@ -230,6 +230,24 @@ MiniTabletServer* InternalMiniCluster::mini_tablet_server(int idx) const {
   return mini_tablet_servers_[idx].get();
 }
 
+MiniTabletServer* InternalMiniCluster::mini_tablet_server_by_uuid(const string& uuid) const {
+  for (const auto& ts : mini_tablet_servers_) {
+    if (ts->uuid() == uuid) {
+      return ts.get();
+    }
+  }
+  return nullptr;
+}
+
+int InternalMiniCluster::tablet_server_index_by_uuid(const std::string& uuid) const {
+  for (int i = 0; i < mini_tablet_servers_.size(); i++) {
+    if (mini_tablet_servers_[i]->uuid() == uuid) {
+      return i;
+    }
+  }
+  return -1;
+}
+
 vector<HostPort> InternalMiniCluster::master_rpc_addrs() const {
   if (opts_.num_masters == 1) {
     const auto& addr = CHECK_NOTNULL(mini_master(0))->bound_rpc_addr();

http://git-wip-us.apache.org/repos/asf/kudu/blob/d9eb7d4c/src/kudu/mini-cluster/internal_mini_cluster.h
----------------------------------------------------------------------
diff --git a/src/kudu/mini-cluster/internal_mini_cluster.h b/src/kudu/mini-cluster/internal_mini_cluster.h
index b44717c..41eacdb 100644
--- a/src/kudu/mini-cluster/internal_mini_cluster.h
+++ b/src/kudu/mini-cluster/internal_mini_cluster.h
@@ -141,6 +141,11 @@ class InternalMiniCluster : public MiniCluster {
   // 'idx' must be between 0 and 'num_tablet_servers' -1.
   tserver::MiniTabletServer* mini_tablet_server(int idx) const;
 
+  // Returns the TabletServer with uuid 'uuid', or nullptr if not found.
+  tserver::MiniTabletServer* mini_tablet_server_by_uuid(const std::string& uuid) const;
+
+  int tablet_server_index_by_uuid(const std::string& uuid) const override;
+
   int num_tablet_servers() const override {
     return mini_tablet_servers_.size();
   }

http://git-wip-us.apache.org/repos/asf/kudu/blob/d9eb7d4c/src/kudu/mini-cluster/mini_cluster.h
----------------------------------------------------------------------
diff --git a/src/kudu/mini-cluster/mini_cluster.h b/src/kudu/mini-cluster/mini_cluster.h
index 8a1440c..f802c08 100644
--- a/src/kudu/mini-cluster/mini_cluster.h
+++ b/src/kudu/mini-cluster/mini_cluster.h
@@ -129,6 +129,10 @@ class MiniCluster {
 
   virtual std::vector<HostPort> master_rpc_addrs() const = 0;
 
+  // Return the index of the tablet server that has the given 'uuid', or
+  // -1 if no such UUID can be found.
+  virtual int tablet_server_index_by_uuid(const std::string& uuid) const = 0;
+
   // Create a client configured to talk to this cluster. 'builder' may contain
   // override options for the client. The master address will be overridden to
   // talk to the running master(s). If 'builder' is a nullptr, default options

http://git-wip-us.apache.org/repos/asf/kudu/blob/d9eb7d4c/src/kudu/rpc/periodic.cc
----------------------------------------------------------------------
diff --git a/src/kudu/rpc/periodic.cc b/src/kudu/rpc/periodic.cc
index 07674de..3b9412d 100644
--- a/src/kudu/rpc/periodic.cc
+++ b/src/kudu/rpc/periodic.cc
@@ -117,6 +117,11 @@ void PeriodicTimer::SnoozeUnlocked(boost::optional<MonoDelta> next_task_delta) {
   next_task_time_ = MonoTime::Now() + *next_task_delta;
 }
 
+bool PeriodicTimer::started() const {
+  std::lock_guard<simple_spinlock> l(lock_);
+  return started_;
+}
+
 MonoDelta PeriodicTimer::GetMinimumPeriod() {
   // Given jitter percentage J and period P, this returns (1-J)*P, which is
   // the lowest possible jittered value.

http://git-wip-us.apache.org/repos/asf/kudu/blob/d9eb7d4c/src/kudu/rpc/periodic.h
----------------------------------------------------------------------
diff --git a/src/kudu/rpc/periodic.h b/src/kudu/rpc/periodic.h
index f65ed85..058e3ce 100644
--- a/src/kudu/rpc/periodic.h
+++ b/src/kudu/rpc/periodic.h
@@ -127,7 +127,7 @@ class PeriodicTimer : public std::enable_shared_from_this<PeriodicTimer> {
   // otherwise the task is not guaranteed to run in a timely manner.
   //
   // Note: Snooze() is not additive. That is, if called at time X and again at
-  // time X + P/2, the timer is snoozed until X+P/2, not X+2P.
+  // time X + P/2, the timer is snoozed until X+P/2+P, not X+2P.
   //
   // Does nothing if the timer is stopped.
   void Snooze(boost::optional<MonoDelta> next_task_delta = boost::none);
@@ -141,6 +141,9 @@ class PeriodicTimer : public std::enable_shared_from_this<PeriodicTimer> {
   // Does nothing if the timer is already stopped.
   void Stop();
 
+  // Returns true iff the failure detected has been started.
+  bool started() const;
+
  private:
   FRIEND_TEST(PeriodicTimerTest, TestCallbackRestartsTimer);