You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by ad...@apache.org on 2016/10/18 20:13:06 UTC

[1/2] kudu git commit: KUDU-1135 (part 2): avoid flushing metadata twice when starting an election

Repository: kudu
Updated Branches:
  refs/heads/master 45c1512fc -> b3ab84979


KUDU-1135 (part 2): avoid flushing metadata twice when starting an election

Change-Id: I231273a1cfa92275788dd99c78e284ecd0543d7a
Reviewed-on: http://gerrit.cloudera.org:8080/4702
Tested-by: Kudu Jenkins
Reviewed-by: Todd Lipcon <to...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/717c70d5
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/717c70d5
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/717c70d5

Branch: refs/heads/master
Commit: 717c70d59552195d90960a3e45f9bdcd5c08af98
Parents: 45c1512
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Oct 12 14:59:35 2016 -0700
Committer: Todd Lipcon <to...@apache.org>
Committed: Tue Oct 18 05:19:54 2016 +0000

----------------------------------------------------------------------
 src/kudu/consensus/raft_consensus.cc             | 15 ++++++---------
 src/kudu/consensus/raft_consensus.h              |  3 ---
 src/kudu/consensus/raft_consensus_quorum-test.cc |  6 ++++++
 3 files changed, 12 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/717c70d5/src/kudu/consensus/raft_consensus.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/raft_consensus.cc b/src/kudu/consensus/raft_consensus.cc
index 4c3de3d..ac12ec7 100644
--- a/src/kudu/consensus/raft_consensus.cc
+++ b/src/kudu/consensus/raft_consensus.cc
@@ -350,7 +350,7 @@ Status RaftConsensus::EmulateElection() {
   LOG_WITH_PREFIX_UNLOCKED(INFO) << "Emulating election...";
 
   // Assume leadership of new term.
-  RETURN_NOT_OK(IncrementTermUnlocked());
+  RETURN_NOT_OK(HandleTermAdvanceUnlocked(state_->GetCurrentTermUnlocked() + 1));
   SetLeaderUuidUnlocked(state_->GetPeerUuid());
   return BecomeLeaderUnlocked();
 }
@@ -419,11 +419,12 @@ Status RaftConsensus::StartElection(ElectionMode mode, ElectionReason reason) {
 
     // Increment the term and vote for ourselves, unless it's a pre-election.
     if (mode != PRE_ELECTION) {
-      // TODO(todd): the IncrementTermUnlocked call flushes to disk once, and then
-      // the SetVotedForCurrentTerm flushes again. We should avoid flushing to disk
-      // on the term bump.
       // TODO(mpercy): Consider using a separate Mutex for voting, which must sync to disk.
-      RETURN_NOT_OK(IncrementTermUnlocked());
+
+      // We skip flushing the term to disk because setting the vote just below also
+      // flushes to disk, and the double fsync doesn't buy us anything.
+      RETURN_NOT_OK(HandleTermAdvanceUnlocked(state_->GetCurrentTermUnlocked() + 1,
+                                              ReplicaState::SKIP_FLUSH_TO_DISK));
       RETURN_NOT_OK(state_->SetVotedForCurrentTermUnlocked(state_->GetPeerUuid()));
     }
 
@@ -2086,10 +2087,6 @@ MonoDelta RaftConsensus::LeaderElectionExpBackoffDeltaUnlocked() {
   return MonoDelta::FromMilliseconds(timeout);
 }
 
-Status RaftConsensus::IncrementTermUnlocked() {
-  return HandleTermAdvanceUnlocked(state_->GetCurrentTermUnlocked() + 1);
-}
-
 Status RaftConsensus::HandleTermAdvanceUnlocked(ConsensusTerm new_term,
                                                 ReplicaState::FlushToDisk flush) {
   if (new_term <= state_->GetCurrentTermUnlocked()) {

http://git-wip-us.apache.org/repos/asf/kudu/blob/717c70d5/src/kudu/consensus/raft_consensus.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/raft_consensus.h b/src/kudu/consensus/raft_consensus.h
index 33bf7da..37f7f83 100644
--- a/src/kudu/consensus/raft_consensus.h
+++ b/src/kudu/consensus/raft_consensus.h
@@ -373,9 +373,6 @@ class RaftConsensus : public Consensus,
   // The maximum delta is capped by 'FLAGS_leader_failure_exp_backoff_max_delta_ms'.
   MonoDelta LeaderElectionExpBackoffDeltaUnlocked();
 
-  // Increment the term to the next term, resetting the current leader, etc.
-  Status IncrementTermUnlocked();
-
   // Handle when the term has advanced beyond the current term.
   //
   // 'flush' may be used to control whether the term change is flushed to disk.

http://git-wip-us.apache.org/repos/asf/kudu/blob/717c70d5/src/kudu/consensus/raft_consensus_quorum-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/raft_consensus_quorum-test.cc b/src/kudu/consensus/raft_consensus_quorum-test.cc
index 2ca8e1b..d9336ac 100644
--- a/src/kudu/consensus/raft_consensus_quorum-test.cc
+++ b/src/kudu/consensus/raft_consensus_quorum-test.cc
@@ -896,11 +896,17 @@ TEST_F(RaftConsensusQuorumTest, TestLeaderElectionWithQuiescedQuorum) {
 
     // This will force an election in which we expect to make the last
     // non-shutdown peer in the list become leader.
+    int flush_count_before = new_leader->GetReplicaStateForTests()
+        ->consensus_metadata_for_tests()->flush_count_for_tests();
     LOG(INFO) << "Running election for future leader with index " << (current_config_size - 1);
     ASSERT_OK(new_leader->StartElection(Consensus::ELECT_EVEN_IF_LEADER_IS_ALIVE,
                                         Consensus::EXTERNAL_REQUEST));
     WaitUntilLeaderForTests(new_leader.get());
     LOG(INFO) << "Election won";
+    int flush_count_after = new_leader->GetReplicaStateForTests()
+        ->consensus_metadata_for_tests()->flush_count_for_tests();
+    ASSERT_EQ(flush_count_after, flush_count_before + 1)
+        << "Expected only one consensus metadata flush for a leader election";
 
     // ... replicating a set of messages to the new leader should now be possible.
     REPLICATE_SEQUENCE_OF_MESSAGES(10,


[2/2] kudu git commit: consensus_peers: a little cleanup of cruft

Posted by ad...@apache.org.
consensus_peers: a little cleanup of cruft

* removes a test method that no longer was necessary
* removes an unimplemented method
* cleans up comments
* makes PeerManager methods non-virtual now that we no longer try to
  mock them

In particular I removed a big "state diagram" that I didn't find very
helpful and replaced it with a bit more text.

This is in prep for some work on KUDU-699.

Change-Id: Ie01c24b840e456482c12f96f3e2bcb3ad4388f0b
Reviewed-on: http://gerrit.cloudera.org:8080/4704
Tested-by: Kudu Jenkins
Reviewed-by: Todd Lipcon <to...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/b3ab8497
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/b3ab8497
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/b3ab8497

Branch: refs/heads/master
Commit: b3ab84979bf6908e0c330503d908f70b6419f879
Parents: 717c70d
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Oct 12 15:41:34 2016 -0700
Committer: Todd Lipcon <to...@apache.org>
Committed: Tue Oct 18 05:20:08 2016 +0000

----------------------------------------------------------------------
 src/kudu/consensus/consensus_peers-test.cc |  4 --
 src/kudu/consensus/consensus_peers.cc      |  3 --
 src/kudu/consensus/consensus_peers.h       | 70 ++++++-------------------
 src/kudu/consensus/peer_manager.h          | 13 +++--
 4 files changed, 23 insertions(+), 67 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/b3ab8497/src/kudu/consensus/consensus_peers-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_peers-test.cc b/src/kudu/consensus/consensus_peers-test.cc
index 5a9d48d..ee8c6ab 100644
--- a/src/kudu/consensus/consensus_peers-test.cc
+++ b/src/kudu/consensus/consensus_peers-test.cc
@@ -149,10 +149,6 @@ TEST_F(ConsensusPeersTest, TestRemotePeer) {
   // Append a bunch of messages to the queue
   AppendReplicateMessagesToQueue(message_queue_.get(), clock_, 1, 20);
 
-  // The above append ends up appending messages in term 2, so we
-  // update the peer's term to match.
-  remote_peer->SetTermForTest(2);
-
   // signal the peer there are requests pending.
   remote_peer->SignalRequest();
   // now wait on the status of the last operation

http://git-wip-us.apache.org/repos/asf/kudu/blob/b3ab8497/src/kudu/consensus/consensus_peers.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_peers.cc b/src/kudu/consensus/consensus_peers.cc
index 819f139..92bb35a 100644
--- a/src/kudu/consensus/consensus_peers.cc
+++ b/src/kudu/consensus/consensus_peers.cc
@@ -114,9 +114,6 @@ Peer::Peer(const RaftPeerPB& peer_pb, string tablet_id, string leader_uuid,
       thread_pool_(thread_pool),
       state_(kPeerCreated) {}
 
-void Peer::SetTermForTest(int term) {
-  response_.set_responder_term(term);
-}
 
 Status Peer::Init() {
   std::lock_guard<simple_spinlock> lock(peer_lock_);

http://git-wip-us.apache.org/repos/asf/kudu/blob/b3ab8497/src/kudu/consensus/consensus_peers.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_peers.h b/src/kudu/consensus/consensus_peers.h
index c1a10cc..82fc8f9 100644
--- a/src/kudu/consensus/consensus_peers.h
+++ b/src/kudu/consensus/consensus_peers.h
@@ -55,57 +55,24 @@ class PeerMessageQueue;
 class VoteRequestPB;
 class VoteResponsePB;
 
-// A peer in consensus (local or remote).
+// A remote peer in consensus.
 //
-// Leaders use peers to update the local Log and remote replicas.
+// Leaders use peers to update the remote replicas. Each peer
+// may have at most one outstanding request at a time. If a
+// request is signaled when there is already one outstanding,
+// the request will be generated once the outstanding one finishes.
 //
 // Peers are owned by the consensus implementation and do not keep
-// state aside from whether there are requests pending or if requests
-// are being processed.
+// state aside from the most recent request and response.
 //
-// There are two external actions that trigger a state change:
-//
-// SignalRequest(): Called by the consensus implementation, notifies
-// that the queue contains messages to be processed.
-//
-// ProcessResponse() Called a response from a peer is received.
-//
-// The following state diagrams describe what happens when a state
-// changing method is called.
-//
-//                        +
-//                        |
-//       SignalRequest()  |
-//                        |
-//                        |
-//                        v
-//              +------------------+
-//       +------+    processing ?  +-----+
-//       |      +------------------+     |
-//       |                               |
-//       | Yes                           | No
-//       |                               |
-//       v                               v
-//     return                      ProcessNextRequest()
-//                                 processing = true
-//                                 - get reqs. from queue
-//                                 - update peer async
-//                                 return
-//
-//                         +
-//                         |
-//      ProcessResponse()  |
-//      processing = false |
-//                         v
-//               +------------------+
-//        +------+   more pending?  +-----+
-//        |      +------------------+     |
-//        |                               |
-//        | Yes                           | No
-//        |                               |
-//        v                               v
-//  SignalRequest()                    return
+// Peers are also responsible for sending periodic heartbeats
+// to assert liveness of the leader. The peer constructs a heartbeater
+// thread to trigger these heartbeats.
 //
+// The actual request construction is delegated to a PeerMessageQueue
+// object, and performed on a thread pool (since it may do IO). When a
+// response is received, the peer updates the PeerMessageQueue
+// using PeerMessageQueue::ResponseFromPeer(...) on the same threadpool.
 class Peer {
  public:
   // Initializes a peer and get its status.
@@ -119,15 +86,12 @@ class Peer {
 
   const RaftPeerPB& peer_pb() const { return peer_pb_; }
 
-  // Returns the PeerProxy if this is a remote peer or NULL if it
-  // isn't. Used for tests to fiddle with the proxy and emulate remote
-  // behavior.
-  PeerProxy* GetPeerProxyForTests();
-
+  // Stop sending requests and periodic heartbeats.
+  // TODO(KUDU-699). This currently blocks until the most recent request
+  // has completed, which is problematic.
   void Close();
 
-  void SetTermForTest(int term);
-
+  // Calls Close() automatically.
   ~Peer();
 
   // Creates a new remote peer and makes the queue track it.'

http://git-wip-us.apache.org/repos/asf/kudu/blob/b3ab8497/src/kudu/consensus/peer_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/peer_manager.h b/src/kudu/consensus/peer_manager.h
index 560f27f..ebbe5f5 100644
--- a/src/kudu/consensus/peer_manager.h
+++ b/src/kudu/consensus/peer_manager.h
@@ -41,9 +41,8 @@ class PeerMessageQueue;
 class PeerProxyFactory;
 class RaftConfigPB;
 
-// Manages the set of local and remote peers that pull data from the
-// queue into the local log/remote machines.
-// Methods are virtual to ease mocking.
+// Manages the remote peers that pull data from the local queue and send updates to the
+// remote machines.
 class PeerManager {
  public:
   // All of the raw pointer arguments are not owned by the PeerManager
@@ -58,16 +57,16 @@ class PeerManager {
               ThreadPool* request_thread_pool,
               const scoped_refptr<log::Log>& log);
 
-  virtual ~PeerManager();
+  ~PeerManager();
 
   // Updates 'peers_' according to the new configuration config.
-  virtual Status UpdateRaftConfig(const RaftConfigPB& config);
+  Status UpdateRaftConfig(const RaftConfigPB& config);
 
   // Signals all peers of the current configuration that there is a new request pending.
-  virtual void SignalRequest(bool force_if_queue_empty = false);
+  void SignalRequest(bool force_if_queue_empty = false);
 
   // Closes all peers.
-  virtual void Close();
+  void Close();
 
  private:
   std::string GetLogPrefix() const;