You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by ad...@apache.org on 2016/10/18 20:13:06 UTC
[1/2] kudu git commit: KUDU-1135 (part 2): avoid flushing metadata
twice when starting an election
Repository: kudu
Updated Branches:
refs/heads/master 45c1512fc -> b3ab84979
KUDU-1135 (part 2): avoid flushing metadata twice when starting an election
Change-Id: I231273a1cfa92275788dd99c78e284ecd0543d7a
Reviewed-on: http://gerrit.cloudera.org:8080/4702
Tested-by: Kudu Jenkins
Reviewed-by: Todd Lipcon <to...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/717c70d5
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/717c70d5
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/717c70d5
Branch: refs/heads/master
Commit: 717c70d59552195d90960a3e45f9bdcd5c08af98
Parents: 45c1512
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Oct 12 14:59:35 2016 -0700
Committer: Todd Lipcon <to...@apache.org>
Committed: Tue Oct 18 05:19:54 2016 +0000
----------------------------------------------------------------------
src/kudu/consensus/raft_consensus.cc | 15 ++++++---------
src/kudu/consensus/raft_consensus.h | 3 ---
src/kudu/consensus/raft_consensus_quorum-test.cc | 6 ++++++
3 files changed, 12 insertions(+), 12 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kudu/blob/717c70d5/src/kudu/consensus/raft_consensus.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/raft_consensus.cc b/src/kudu/consensus/raft_consensus.cc
index 4c3de3d..ac12ec7 100644
--- a/src/kudu/consensus/raft_consensus.cc
+++ b/src/kudu/consensus/raft_consensus.cc
@@ -350,7 +350,7 @@ Status RaftConsensus::EmulateElection() {
LOG_WITH_PREFIX_UNLOCKED(INFO) << "Emulating election...";
// Assume leadership of new term.
- RETURN_NOT_OK(IncrementTermUnlocked());
+ RETURN_NOT_OK(HandleTermAdvanceUnlocked(state_->GetCurrentTermUnlocked() + 1));
SetLeaderUuidUnlocked(state_->GetPeerUuid());
return BecomeLeaderUnlocked();
}
@@ -419,11 +419,12 @@ Status RaftConsensus::StartElection(ElectionMode mode, ElectionReason reason) {
// Increment the term and vote for ourselves, unless it's a pre-election.
if (mode != PRE_ELECTION) {
- // TODO(todd): the IncrementTermUnlocked call flushes to disk once, and then
- // the SetVotedForCurrentTerm flushes again. We should avoid flushing to disk
- // on the term bump.
// TODO(mpercy): Consider using a separate Mutex for voting, which must sync to disk.
- RETURN_NOT_OK(IncrementTermUnlocked());
+
+ // We skip flushing the term to disk because setting the vote just below also
+ // flushes to disk, and the double fsync doesn't buy us anything.
+ RETURN_NOT_OK(HandleTermAdvanceUnlocked(state_->GetCurrentTermUnlocked() + 1,
+ ReplicaState::SKIP_FLUSH_TO_DISK));
RETURN_NOT_OK(state_->SetVotedForCurrentTermUnlocked(state_->GetPeerUuid()));
}
@@ -2086,10 +2087,6 @@ MonoDelta RaftConsensus::LeaderElectionExpBackoffDeltaUnlocked() {
return MonoDelta::FromMilliseconds(timeout);
}
-Status RaftConsensus::IncrementTermUnlocked() {
- return HandleTermAdvanceUnlocked(state_->GetCurrentTermUnlocked() + 1);
-}
-
Status RaftConsensus::HandleTermAdvanceUnlocked(ConsensusTerm new_term,
ReplicaState::FlushToDisk flush) {
if (new_term <= state_->GetCurrentTermUnlocked()) {
http://git-wip-us.apache.org/repos/asf/kudu/blob/717c70d5/src/kudu/consensus/raft_consensus.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/raft_consensus.h b/src/kudu/consensus/raft_consensus.h
index 33bf7da..37f7f83 100644
--- a/src/kudu/consensus/raft_consensus.h
+++ b/src/kudu/consensus/raft_consensus.h
@@ -373,9 +373,6 @@ class RaftConsensus : public Consensus,
// The maximum delta is capped by 'FLAGS_leader_failure_exp_backoff_max_delta_ms'.
MonoDelta LeaderElectionExpBackoffDeltaUnlocked();
- // Increment the term to the next term, resetting the current leader, etc.
- Status IncrementTermUnlocked();
-
// Handle when the term has advanced beyond the current term.
//
// 'flush' may be used to control whether the term change is flushed to disk.
http://git-wip-us.apache.org/repos/asf/kudu/blob/717c70d5/src/kudu/consensus/raft_consensus_quorum-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/raft_consensus_quorum-test.cc b/src/kudu/consensus/raft_consensus_quorum-test.cc
index 2ca8e1b..d9336ac 100644
--- a/src/kudu/consensus/raft_consensus_quorum-test.cc
+++ b/src/kudu/consensus/raft_consensus_quorum-test.cc
@@ -896,11 +896,17 @@ TEST_F(RaftConsensusQuorumTest, TestLeaderElectionWithQuiescedQuorum) {
// This will force an election in which we expect to make the last
// non-shutdown peer in the list become leader.
+ int flush_count_before = new_leader->GetReplicaStateForTests()
+ ->consensus_metadata_for_tests()->flush_count_for_tests();
LOG(INFO) << "Running election for future leader with index " << (current_config_size - 1);
ASSERT_OK(new_leader->StartElection(Consensus::ELECT_EVEN_IF_LEADER_IS_ALIVE,
Consensus::EXTERNAL_REQUEST));
WaitUntilLeaderForTests(new_leader.get());
LOG(INFO) << "Election won";
+ int flush_count_after = new_leader->GetReplicaStateForTests()
+ ->consensus_metadata_for_tests()->flush_count_for_tests();
+ ASSERT_EQ(flush_count_after, flush_count_before + 1)
+ << "Expected only one consensus metadata flush for a leader election";
// ... replicating a set of messages to the new leader should now be possible.
REPLICATE_SEQUENCE_OF_MESSAGES(10,
[2/2] kudu git commit: consensus_peers: a little cleanup of cruft
Posted by ad...@apache.org.
consensus_peers: a little cleanup of cruft
* removes a test method that no longer was necessary
* removes an unimplemented method
* cleans up comments
* makes PeerManager methods non-virtual now that we no longer try to
mock them
In particular I removed a big "state diagram" that I didn't find very
helpful and replaced it with a bit more text.
This is in prep for some work on KUDU-699.
Change-Id: Ie01c24b840e456482c12f96f3e2bcb3ad4388f0b
Reviewed-on: http://gerrit.cloudera.org:8080/4704
Tested-by: Kudu Jenkins
Reviewed-by: Todd Lipcon <to...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/b3ab8497
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/b3ab8497
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/b3ab8497
Branch: refs/heads/master
Commit: b3ab84979bf6908e0c330503d908f70b6419f879
Parents: 717c70d
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Oct 12 15:41:34 2016 -0700
Committer: Todd Lipcon <to...@apache.org>
Committed: Tue Oct 18 05:20:08 2016 +0000
----------------------------------------------------------------------
src/kudu/consensus/consensus_peers-test.cc | 4 --
src/kudu/consensus/consensus_peers.cc | 3 --
src/kudu/consensus/consensus_peers.h | 70 ++++++-------------------
src/kudu/consensus/peer_manager.h | 13 +++--
4 files changed, 23 insertions(+), 67 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kudu/blob/b3ab8497/src/kudu/consensus/consensus_peers-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_peers-test.cc b/src/kudu/consensus/consensus_peers-test.cc
index 5a9d48d..ee8c6ab 100644
--- a/src/kudu/consensus/consensus_peers-test.cc
+++ b/src/kudu/consensus/consensus_peers-test.cc
@@ -149,10 +149,6 @@ TEST_F(ConsensusPeersTest, TestRemotePeer) {
// Append a bunch of messages to the queue
AppendReplicateMessagesToQueue(message_queue_.get(), clock_, 1, 20);
- // The above append ends up appending messages in term 2, so we
- // update the peer's term to match.
- remote_peer->SetTermForTest(2);
-
// signal the peer there are requests pending.
remote_peer->SignalRequest();
// now wait on the status of the last operation
http://git-wip-us.apache.org/repos/asf/kudu/blob/b3ab8497/src/kudu/consensus/consensus_peers.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_peers.cc b/src/kudu/consensus/consensus_peers.cc
index 819f139..92bb35a 100644
--- a/src/kudu/consensus/consensus_peers.cc
+++ b/src/kudu/consensus/consensus_peers.cc
@@ -114,9 +114,6 @@ Peer::Peer(const RaftPeerPB& peer_pb, string tablet_id, string leader_uuid,
thread_pool_(thread_pool),
state_(kPeerCreated) {}
-void Peer::SetTermForTest(int term) {
- response_.set_responder_term(term);
-}
Status Peer::Init() {
std::lock_guard<simple_spinlock> lock(peer_lock_);
http://git-wip-us.apache.org/repos/asf/kudu/blob/b3ab8497/src/kudu/consensus/consensus_peers.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_peers.h b/src/kudu/consensus/consensus_peers.h
index c1a10cc..82fc8f9 100644
--- a/src/kudu/consensus/consensus_peers.h
+++ b/src/kudu/consensus/consensus_peers.h
@@ -55,57 +55,24 @@ class PeerMessageQueue;
class VoteRequestPB;
class VoteResponsePB;
-// A peer in consensus (local or remote).
+// A remote peer in consensus.
//
-// Leaders use peers to update the local Log and remote replicas.
+// Leaders use peers to update the remote replicas. Each peer
+// may have at most one outstanding request at a time. If a
+// request is signaled when there is already one outstanding,
+// the request will be generated once the outstanding one finishes.
//
// Peers are owned by the consensus implementation and do not keep
-// state aside from whether there are requests pending or if requests
-// are being processed.
+// state aside from the most recent request and response.
//
-// There are two external actions that trigger a state change:
-//
-// SignalRequest(): Called by the consensus implementation, notifies
-// that the queue contains messages to be processed.
-//
-// ProcessResponse() Called a response from a peer is received.
-//
-// The following state diagrams describe what happens when a state
-// changing method is called.
-//
-// +
-// |
-// SignalRequest() |
-// |
-// |
-// v
-// +------------------+
-// +------+ processing ? +-----+
-// | +------------------+ |
-// | |
-// | Yes | No
-// | |
-// v v
-// return ProcessNextRequest()
-// processing = true
-// - get reqs. from queue
-// - update peer async
-// return
-//
-// +
-// |
-// ProcessResponse() |
-// processing = false |
-// v
-// +------------------+
-// +------+ more pending? +-----+
-// | +------------------+ |
-// | |
-// | Yes | No
-// | |
-// v v
-// SignalRequest() return
+// Peers are also responsible for sending periodic heartbeats
+// to assert liveness of the leader. The peer constructs a heartbeater
+// thread to trigger these heartbeats.
//
+// The actual request construction is delegated to a PeerMessageQueue
+// object, and performed on a thread pool (since it may do IO). When a
+// response is received, the peer updates the PeerMessageQueue
+// using PeerMessageQueue::ResponseFromPeer(...) on the same threadpool.
class Peer {
public:
// Initializes a peer and get its status.
@@ -119,15 +86,12 @@ class Peer {
const RaftPeerPB& peer_pb() const { return peer_pb_; }
- // Returns the PeerProxy if this is a remote peer or NULL if it
- // isn't. Used for tests to fiddle with the proxy and emulate remote
- // behavior.
- PeerProxy* GetPeerProxyForTests();
-
+ // Stop sending requests and periodic heartbeats.
+ // TODO(KUDU-699). This currently blocks until the most recent request
+ // has completed, which is problematic.
void Close();
- void SetTermForTest(int term);
-
+ // Calls Close() automatically.
~Peer();
// Creates a new remote peer and makes the queue track it.'
http://git-wip-us.apache.org/repos/asf/kudu/blob/b3ab8497/src/kudu/consensus/peer_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/peer_manager.h b/src/kudu/consensus/peer_manager.h
index 560f27f..ebbe5f5 100644
--- a/src/kudu/consensus/peer_manager.h
+++ b/src/kudu/consensus/peer_manager.h
@@ -41,9 +41,8 @@ class PeerMessageQueue;
class PeerProxyFactory;
class RaftConfigPB;
-// Manages the set of local and remote peers that pull data from the
-// queue into the local log/remote machines.
-// Methods are virtual to ease mocking.
+// Manages the remote peers that pull data from the local queue and send updates to the
+// remote machines.
class PeerManager {
public:
// All of the raw pointer arguments are not owned by the PeerManager
@@ -58,16 +57,16 @@ class PeerManager {
ThreadPool* request_thread_pool,
const scoped_refptr<log::Log>& log);
- virtual ~PeerManager();
+ ~PeerManager();
// Updates 'peers_' according to the new configuration config.
- virtual Status UpdateRaftConfig(const RaftConfigPB& config);
+ Status UpdateRaftConfig(const RaftConfigPB& config);
// Signals all peers of the current configuration that there is a new request pending.
- virtual void SignalRequest(bool force_if_queue_empty = false);
+ void SignalRequest(bool force_if_queue_empty = false);
// Closes all peers.
- virtual void Close();
+ void Close();
private:
std::string GetLogPrefix() const;