You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by be...@apache.org on 2014/01/17 01:58:21 UTC
[01/10] Decoupled replicated log coordinator logic and made it
asynchronous.
Updated Branches:
refs/heads/master 2ff53088b -> 420e30bfe
http://git-wip-us.apache.org/repos/asf/mesos/blob/19ad88b7/src/log/replica.cpp
----------------------------------------------------------------------
diff --git a/src/log/replica.cpp b/src/log/replica.cpp
index 82c2157..032180a 100644
--- a/src/log/replica.cpp
+++ b/src/log/replica.cpp
@@ -25,7 +25,7 @@
#include <algorithm>
#include <process/dispatch.hpp>
-#include <process/protobuf.hpp>
+#include <process/id.hpp>
#include <stout/check.hpp>
#include <stout/error.hpp>
@@ -59,18 +59,17 @@ namespace protocol {
// Some replica protocol definitions.
Protocol<PromiseRequest, PromiseResponse> promise;
Protocol<WriteRequest, WriteResponse> write;
-Protocol<LearnRequest, LearnResponse> learn;
} // namespace protocol {
struct State
{
- uint64_t coordinator; // Last promise made to a coordinator.
+ uint64_t proposal; // Last promise made.
uint64_t begin; // Beginning position of the log.
uint64_t end; // Ending position of the log.
- std::set<uint64_t> learned; // Positions present and learned
- std::set<uint64_t> unlearned; // Positions present but unlearned.
+ set<uint64_t> learned; // Positions present and learned
+ set<uint64_t> unlearned; // Positions present but unlearned.
};
@@ -237,7 +236,7 @@ Try<State> LevelDBStorage::recover(const string& path)
LOG(INFO) << "Compacted db in " << stopwatch.elapsed();
State state;
- state.coordinator = 0;
+ state.proposal = 0;
state.begin = 0;
state.end = 0;
@@ -277,14 +276,14 @@ Try<State> LevelDBStorage::recover(const string& path)
switch (record.type()) {
case Record::METADATA: {
CHECK(record.has_metadata());
- state.coordinator = record.metadata().promised();
+ state.proposal = record.metadata().promised();
break;
}
// DEPRECATED!
case Record::PROMISE: {
CHECK(record.has_promise());
- state.coordinator = record.promise().id();
+ state.proposal = record.promise().proposal();
break;
}
@@ -484,7 +483,7 @@ class ReplicaProcess : public ProtobufProcess<ReplicaProcess>
public:
// Constructs a new replica process using specified path to a
// directory for storing the underlying log.
- ReplicaProcess(const std::string& path);
+ ReplicaProcess(const string& path);
virtual ~ReplicaProcess();
@@ -498,11 +497,15 @@ public:
// Returns all the actions between the specified positions, unless
// those positions are invalid, in which case returns an error.
- process::Future<std::list<Action> > read(uint64_t from, uint64_t to);
+ Future<list<Action> > read(uint64_t from, uint64_t to);
+
+ // Returns true if the specified position is missing in the log
+ // (i.e., unlearned or holes).
+ bool missing(uint64_t position);
// Returns missing positions in the log (i.e., unlearned or holes)
- // up to the specified position.
- std::set<uint64_t> missing(uint64_t position);
+ // within the specified range [from, to].
+ set<uint64_t> missing(uint64_t from, uint64_t to);
// Returns the beginning position of the log.
uint64_t beginning();
@@ -514,17 +517,13 @@ public:
uint64_t promised();
private:
- // Handles a request from a coordinator to promise not to accept
- // writes from any other coordinator.
+ // Handles a request from a proposer to promise not to accept writes
+ // from any other proposer with lower proposal number.
void promise(const PromiseRequest& request);
- // Handles a request from a coordinator to write an action.
+ // Handles a request from a proposer to write an action.
void write(const WriteRequest& request);
- // Handles a request from a coordinator (or replica) to learn the
- // specified position in the log.
- void learn(uint64_t position);
-
// Handles a message notifying of a learned action.
void learned(const Action& action);
@@ -534,13 +533,13 @@ private:
bool persist(const Action& action);
// Helper routine to recover log (e.g., on restart).
- void recover(const std::string& path);
+ void recover(const string& path);
// Underlying storage for the log.
Storage* storage;
- // Last promise made to a coordinator.
- uint64_t coordinator;
+ // Last promise made to a proposer.
+ uint64_t proposal;
// Beginning position of log (after *learned* truncations).
uint64_t begin;
@@ -549,19 +548,21 @@ private:
uint64_t end;
// Holes in the log.
- std::set<uint64_t> holes;
+ set<uint64_t> holes;
// Unlearned positions in the log.
- std::set<uint64_t> unlearned;
+ set<uint64_t> unlearned;
};
ReplicaProcess::ReplicaProcess(const string& path)
- : coordinator(0),
+ : ProcessBase(ID::generate("log-replica")),
+ proposal(0),
begin(0),
end(0)
{
- storage = new LevelDBStorage(); // TODO(benh): Factor out and expose storage.
+ // TODO(benh): Factor out and expose storage.
+ storage = new LevelDBStorage();
recover(path);
@@ -575,10 +576,6 @@ ReplicaProcess::ReplicaProcess(const string& path)
install<LearnedMessage>(
&ReplicaProcess::learned,
&LearnedMessage::action);
-
- install<LearnRequest>(
- &ReplicaProcess::learn,
- &LearnRequest::position);
}
@@ -613,9 +610,7 @@ Result<Action> ReplicaProcess::read(uint64_t position)
// TODO(benh): Make this function actually return a Try once we change
// the future semantics to not include failures.
-process::Future<list<Action> > ReplicaProcess::read(
- uint64_t from,
- uint64_t to)
+Future<list<Action> > ReplicaProcess::read(uint64_t from, uint64_t to)
{
if (to < from) {
process::Promise<list<Action> > promise;
@@ -649,26 +644,46 @@ process::Future<list<Action> > ReplicaProcess::read(
}
-set<uint64_t> ReplicaProcess::missing(uint64_t index)
+bool ReplicaProcess::missing(uint64_t position)
{
- // Start off with all the unlearned positions.
- set<uint64_t> positions = unlearned;
-
- // Add in a spoonful of holes.
- foreach (uint64_t hole, holes) {
- positions.insert(hole);
+ if (position < begin) {
+ return false; // Truncated positions are treated as learned.
+ } else if (position > end) {
+ return true;
+ } else {
+ if (unlearned.count(position) != 0 || holes.count(position) != 0) {
+ return true;
+ } else {
+ return false;
+ }
}
+}
- // And finally add all the unknown positions beyond our end.
- for (; index >= end; index--) {
- positions.insert(index);
- // Don't wrap around 0!
- if (index == 0) {
- break;
+set<uint64_t> ReplicaProcess::missing(uint64_t from, uint64_t to)
+{
+ // TODO(jieyu): Optimize the performance for the common case.
+ set<uint64_t> positions;
+
+ // Add unlearned positions.
+ foreach (uint64_t p, unlearned) {
+ if (p >= from && p <= to) {
+ positions.insert(p);
}
}
+ // Add holes.
+ foreach (uint64_t p, holes) {
+ if (p >= from && p <= to) {
+ positions.insert(p);
+ }
+ }
+
+ // Add all the unknown positions beyond our end.
+ for (; to > end; to--) {
+ positions.insert(to);
+ }
+
return positions;
}
@@ -687,54 +702,58 @@ uint64_t ReplicaProcess::ending()
uint64_t ReplicaProcess::promised()
{
- return coordinator;
+ return proposal;
}
// Note that certain failures that occur result in returning from the
-// current function but *NOT* sending a 'nack' back to the coordinator
-// because that implies a coordinator has been demoted. Not sending
+// current function but *NOT* sending a NACK back to the proposer
+// because that implies a proposer has been demoted. Not sending
// anything is equivalent to pretending like the request never made it
// here. TODO(benh): At some point, however, we might want to actually
// "fail" more dramatically because there could be something rather
-// seriously wrong on this box that we are ignoring (like a bad
-// disk). This could be accomplished by changing most LOG(ERROR)
-// statements to LOG(FATAL), or by counting the number of errors and
-// after reaching some threshold aborting. In addition, sending the
-// error information back to the coordinator "might" help the
-// debugging procedure.
+// seriously wrong on this box that we are ignoring (like a bad disk).
+// This could be accomplished by changing most LOG(ERROR) statements
+// to LOG(FATAL), or by counting the number of errors and after
+// reaching some threshold aborting. In addition, sending the error
+// information back to the proposer "might" help the debugging
+// procedure.
void ReplicaProcess::promise(const PromiseRequest& request)
{
if (request.has_position()) {
- LOG(INFO) << "Replica received explicit promise request for "
- << request.id() << " for position " << request.position();
-
- // If the position has been truncated, tell the coordinator that
- // it's a learned no-op. This can happen when a replica has missed
- // some truncates and it's coordinator tries to fill some
- // truncated positions on election. A learned no-op is safe since
- // the coordinator should eventually learn that this position was
+ LOG(INFO) << "Replica received explicit promise request for position "
+ << request.position() << " with proposal " << request.proposal();
+
+ // If the position has been truncated, tell the proposer that it's
+ // a learned no-op. This can happen when a replica has missed some
+ // truncates and its proposer tries to fill some truncated
+ // positions on election. A learned no-op is safe since the
+ // proposer should eventually learn that this position was
// actually truncated. The action must be _learned_ so that the
- // coordinator doesn't attempt to run a full Paxos round which
- // will never succeed because this replica will not permit the
- // write (because ReplicaProcess::write "ignores" writes on
- // truncated positions).
+ // proposer doesn't attempt to run a full Paxos round which will
+ // never succeed because this replica will not permit the write
+ // (because ReplicaProcess::write "ignores" writes on truncated
+ // positions).
+ // TODO(jieyu): Think about whether we need to check proposal
+ // number so that we don't reply to a proposer whose number is
+ // obviously smaller than most of the proposers in the system.
if (request.position() < begin) {
Action action;
action.set_position(request.position());
- action.set_promised(coordinator); // Use the last coordinator.
- action.set_performed(coordinator); // Use the last coordinator.
+ action.set_promised(proposal); // Use the last promised proposal.
+ action.set_performed(proposal); // Use the last promised proposal.
action.set_learned(true);
action.set_type(Action::NOP);
action.mutable_nop()->MergeFrom(Action::Nop());
PromiseResponse response;
response.set_okay(true);
- response.set_id(request.id());
+ response.set_proposal(request.proposal());
response.mutable_action()->MergeFrom(action);
reply(response);
+ return;
}
// Need to get the action for the specified position.
@@ -744,63 +763,86 @@ void ReplicaProcess::promise(const PromiseRequest& request)
LOG(ERROR) << "Error getting log record at " << request.position()
<< ": " << result.error();
} else if (result.isNone()) {
- Action action;
- action.set_position(request.position());
- action.set_promised(request.id());
-
- if (persist(action)) {
+ // This position has been implicitly promised to a proposer.
+ // Therefore, we should no longer give promise to a proposer
+ // with a lower (or equal) proposal number. If not, we may
+ // accept writes from both proposers, causing a potential
+ // inconsistency in the log. For example, there are three
+ // replicas R1, R2 and R3. Assume that log position 1 in all
+ // replicas is implicitly promised to proposer 2. Later,
+ // proposer 1 asks for explicit promises from R2 and R3 for log
+ // position 1. If we don't perform the following check, R2 and
+ // R3 will give their promises to proposer 1 for log position 1.
+ // As a result, proposer 1 can successfully write a value X to
+ // log position 1 and thinks that X is agreed, while proposer 2
+ // can later write a value Y and also believes that Y is agreed.
+ if (request.proposal() <= proposal) {
+ // If a promise request is rejected because of the proposal
+ // number check, we reply with the currently promised proposal
+ // number so that the proposer can bump its proposal number
+ // and retry if needed to ensure liveness.
PromiseResponse response;
- response.set_okay(true);
- response.set_id(request.id());
- response.set_position(request.position());
+ response.set_okay(false);
+ response.set_proposal(proposal);
reply(response);
+ } else {
+ Action action;
+ action.set_position(request.position());
+ action.set_promised(request.proposal());
+
+ if (persist(action)) {
+ PromiseResponse response;
+ response.set_okay(true);
+ response.set_proposal(request.proposal());
+ response.set_position(request.position());
+ reply(response);
+ }
}
} else {
CHECK_SOME(result);
Action action = result.get();
- CHECK(action.position() == request.position());
+ CHECK_EQ(action.position(), request.position());
- if (request.id() < action.promised()) {
+ if (request.proposal() <= action.promised()) {
PromiseResponse response;
response.set_okay(false);
- response.set_id(request.id());
- response.set_position(request.position());
+ response.set_proposal(action.promised());
reply(response);
} else {
Action original = action;
- action.set_promised(request.id());
+ action.set_promised(request.proposal());
if (persist(action)) {
PromiseResponse response;
response.set_okay(true);
- response.set_id(request.id());
+ response.set_proposal(request.proposal());
response.mutable_action()->MergeFrom(original);
reply(response);
}
}
}
} else {
- LOG(INFO) << "Replica received implicit promise request for "
- << request.id();
+ LOG(INFO) << "Replica received implicit promise request with proposal "
+ << request.proposal();
- if (request.id() <= coordinator) { // Only make an implicit promise once!
- LOG(INFO) << "Replica denying promise request for "
- << request.id();
+ if (request.proposal() <= proposal) { // Only make an implicit promise once!
+ LOG(INFO) << "Replica denying promise request with proposal "
+ << request.proposal();
PromiseResponse response;
response.set_okay(false);
- response.set_id(request.id());
+ response.set_proposal(proposal);
reply(response);
} else {
Promise promise;
- promise.set_id(request.id());
+ promise.set_proposal(request.proposal());
if (persist(promise)) {
- coordinator = request.id();
+ proposal = request.proposal();
// Return the last position written.
PromiseResponse response;
response.set_okay(true);
- response.set_id(request.id());
+ response.set_proposal(request.proposal());
response.set_position(end);
reply(response);
}
@@ -811,7 +853,8 @@ void ReplicaProcess::promise(const PromiseRequest& request)
void ReplicaProcess::write(const WriteRequest& request)
{
- LOG(INFO) << "Replica received write request for position " << request.position();
+ LOG(INFO) << "Replica received write request for position "
+ << request.position();
Result<Action> result = read(request.position());
@@ -819,17 +862,17 @@ void ReplicaProcess::write(const WriteRequest& request)
LOG(ERROR) << "Error getting log record at " << request.position()
<< ": " << result.error();
} else if (result.isNone()) {
- if (request.id() < coordinator) {
+ if (request.proposal() < proposal) {
WriteResponse response;
response.set_okay(false);
- response.set_id(request.id());
+ response.set_proposal(proposal);
response.set_position(request.position());
reply(response);
} else {
Action action;
action.set_position(request.position());
- action.set_promised(coordinator);
- action.set_performed(request.id());
+ action.set_promised(proposal);
+ action.set_performed(request.proposal());
if (request.has_learned()) action.set_learned(request.learned());
action.set_type(request.type());
@@ -840,11 +883,11 @@ void ReplicaProcess::write(const WriteRequest& request)
break;
case Action::APPEND:
CHECK(request.has_append());
- action.mutable_append()->MergeFrom(request.append());
+ action.mutable_append()->CopyFrom(request.append());
break;
case Action::TRUNCATE:
CHECK(request.has_truncate());
- action.mutable_truncate()->MergeFrom(request.truncate());
+ action.mutable_truncate()->CopyFrom(request.truncate());
break;
default:
LOG(FATAL) << "Unknown Action::Type!";
@@ -853,25 +896,42 @@ void ReplicaProcess::write(const WriteRequest& request)
if (persist(action)) {
WriteResponse response;
response.set_okay(true);
- response.set_id(request.id());
+ response.set_proposal(request.proposal());
response.set_position(request.position());
reply(response);
}
}
} else if (result.isSome()) {
Action action = result.get();
- CHECK(action.position() == request.position());
+ CHECK_EQ(action.position(), request.position());
- if (request.id() < action.promised()) {
+ if (request.proposal() < action.promised()) {
WriteResponse response;
response.set_okay(false);
- response.set_id(request.id());
+ response.set_proposal(action.promised());
response.set_position(request.position());
reply(response);
} else {
// TODO(benh): Check if this position has already been learned,
// and if so, check that we are re-writing the same value!
- action.set_performed(request.id());
+ //
+ // TODO(jieyu): Interestingly, in the presence of truncations,
+ // we may encounter a situation where this position has already
+ // been learned, but we are re-writing a different value. For
+ // example, assume that there are 5 replicas (R1 ~ R5). First,
+ // an append operation has been agreed at position 5 by R1, R2,
+ // R3 and R4, but only R1 receives a learned message. Later, a
+ // truncate operation has been agreed at position 10 by R1, R2
+ // and R3, but only R1 receives a learned message. Now, a leader
+ // failover happens and R5 is filled with a NOP at position 5
+ // because its coordinator receives a learned NOP at position 5
+ // from R1 (because of its learned truncation at position 10).
+ // Now, another leader failover happens and R4's coordinator
+ // tries to fill position 5. However, it is only able to contact
+ // R2, R3 and R4 during the explicit promise phase. As a result,
+ // it will try to write an append operation at position 5 to R5
+ // while R5 currently has a learned NOP stored at position 5.
+ action.set_performed(request.proposal());
action.clear_learned();
if (request.has_learned()) action.set_learned(request.learned());
action.clear_type();
@@ -887,11 +947,11 @@ void ReplicaProcess::write(const WriteRequest& request)
break;
case Action::APPEND:
CHECK(request.has_append());
- action.mutable_append()->MergeFrom(request.append());
+ action.mutable_append()->CopyFrom(request.append());
break;
case Action::TRUNCATE:
CHECK(request.has_truncate());
- action.mutable_truncate()->MergeFrom(request.truncate());
+ action.mutable_truncate()->CopyFrom(request.truncate());
break;
default:
LOG(FATAL) << "Unknown Action::Type!";
@@ -900,7 +960,7 @@ void ReplicaProcess::write(const WriteRequest& request)
if (persist(action)) {
WriteResponse response;
response.set_okay(true);
- response.set_id(request.id());
+ response.set_proposal(request.proposal());
response.set_position(request.position());
reply(response);
}
@@ -911,42 +971,18 @@ void ReplicaProcess::write(const WriteRequest& request)
void ReplicaProcess::learned(const Action& action)
{
- LOG(INFO) << "Replica received learned notice for position " << action.position();
+ LOG(INFO) << "Replica received learned notice for position "
+ << action.position();
CHECK(action.learned());
if (persist(action)) {
- LOG(INFO) << "Replica learned "
- << Action::Type_Name(action.type())
+ LOG(INFO) << "Replica learned " << Action::Type_Name(action.type())
<< " action at position " << action.position();
}
}
-void ReplicaProcess::learn(uint64_t position)
-{
- LOG(INFO) << "Replica received learn request for position " << position;
-
- Result<Action> result = read(position);
-
- if (result.isError()) {
- LOG(ERROR) << "Error getting log record at " << position
- << ": " << result.error();
- } else if (result.isSome() &&
- result.get().has_learned() &&
- result.get().learned()) {
- LearnResponse response;
- response.set_okay(true);
- response.mutable_action()->MergeFrom(result.get());
- reply(response);
- } else {
- LearnResponse response;
- response.set_okay(false);
- reply(response);
- }
-}
-
-
bool ReplicaProcess::persist(const Promise& promise)
{
Try<Nothing> persisted = storage->persist(promise);
@@ -956,7 +992,7 @@ bool ReplicaProcess::persist(const Promise& promise)
return false;
}
- LOG(INFO) << "Persisted promise to " << promise.id();
+ LOG(INFO) << "Persisted promise to " << promise.proposal();
return true;
}
@@ -999,6 +1035,9 @@ bool ReplicaProcess::persist(const Action& action)
// And update the beginning position.
begin = std::max(begin, action.truncate().to());
}
+ } else {
+ // We just introduced an unlearned position.
+ unlearned.insert(action.position());
}
// Update holes if we just wrote many positions past the last end.
@@ -1020,13 +1059,13 @@ void ReplicaProcess::recover(const string& path)
CHECK_SOME(state) << "Failed to recover the log";
// Pull out and save some of the state.
- coordinator = state.get().coordinator;
+ proposal = state.get().proposal;
begin = state.get().begin;
end = state.get().end;
unlearned = state.get().unlearned;
// Only use the learned positions to help determine the holes.
- const std::set<uint64_t>& learned = state.get().learned;
+ const set<uint64_t>& learned = state.get().learned;
// We need to assume that position 0 is a hole for a brand new log
// (a coordinator will simply fill it with a no-op when it first
@@ -1050,54 +1089,58 @@ void ReplicaProcess::recover(const string& path)
}
-Replica::Replica(const std::string& path)
+Replica::Replica(const string& path)
{
process = new ReplicaProcess(path);
- process::spawn(process);
+ spawn(process);
}
Replica::~Replica()
{
- process::terminate(process);
+ terminate(process);
process::wait(process);
delete process;
}
-process::Future<std::list<Action> > Replica::read(
- uint64_t from,
- uint64_t to)
+Future<list<Action> > Replica::read(uint64_t from, uint64_t to) const
+{
+ return dispatch(process, &ReplicaProcess::read, from, to);
+}
+
+
+Future<bool> Replica::missing(uint64_t position) const
{
- return process::dispatch(process, &ReplicaProcess::read, from, to);
+ return dispatch(process, &ReplicaProcess::missing, position);
}
-process::Future<std::set<uint64_t> > Replica::missing(uint64_t position)
+Future<set<uint64_t> > Replica::missing(uint64_t from, uint64_t to) const
{
- return process::dispatch(process, &ReplicaProcess::missing, position);
+ return dispatch(process, &ReplicaProcess::missing, from, to);
}
-process::Future<uint64_t> Replica::beginning()
+Future<uint64_t> Replica::beginning() const
{
- return process::dispatch(process, &ReplicaProcess::beginning);
+ return dispatch(process, &ReplicaProcess::beginning);
}
-process::Future<uint64_t> Replica::ending()
+Future<uint64_t> Replica::ending() const
{
- return process::dispatch(process, &ReplicaProcess::ending);
+ return dispatch(process, &ReplicaProcess::ending);
}
-process::Future<uint64_t> Replica::promised()
+Future<uint64_t> Replica::promised() const
{
- return process::dispatch(process, &ReplicaProcess::promised);
+ return dispatch(process, &ReplicaProcess::promised);
}
-process::PID<ReplicaProcess> Replica::pid()
+PID<ReplicaProcess> Replica::pid() const
{
return process->self();
}
http://git-wip-us.apache.org/repos/asf/mesos/blob/19ad88b7/src/log/replica.hpp
----------------------------------------------------------------------
diff --git a/src/log/replica.hpp b/src/log/replica.hpp
index d1f5ead..4cc7031 100644
--- a/src/log/replica.hpp
+++ b/src/log/replica.hpp
@@ -23,11 +23,9 @@
#include <set>
#include <string>
-#include <process/process.hpp>
#include <process/protobuf.hpp>
#include <stout/result.hpp>
-#include <stout/try.hpp>
#include "messages/log.hpp"
@@ -40,7 +38,6 @@ namespace protocol {
// Some replica protocol declarations.
extern Protocol<PromiseRequest, PromiseResponse> promise;
extern Protocol<WriteRequest, WriteResponse> write;
-extern Protocol<LearnRequest, LearnResponse> learn;
} // namespace protocol {
@@ -59,23 +56,31 @@ public:
// Returns all the actions between the specified positions, unless
// those positions are invalid, in which case returns an error.
- process::Future<std::list<Action> > read(uint64_t from, uint64_t to);
+ process::Future<std::list<Action> > read(
+ uint64_t from,
+ uint64_t to) const;
+
+ // Returns true if the specified position is missing in the log
+ // (i.e., unlearned or holes).
+ process::Future<bool> missing(uint64_t position) const;
// Returns missing positions in the log (i.e., unlearned or holes)
- // up to the specified position.
- process::Future<std::set<uint64_t> > missing(uint64_t position);
+ // within the specified range [from, to].
+ process::Future<std::set<uint64_t> > missing(
+ uint64_t from,
+ uint64_t to) const;
// Returns the beginning position of the log.
- process::Future<uint64_t> beginning();
+ process::Future<uint64_t> beginning() const;
// Returns the last written position in the log.
- process::Future<uint64_t> ending();
+ process::Future<uint64_t> ending() const;
// Returns the highest implicit promise this replica has given.
- process::Future<uint64_t> promised();
+ process::Future<uint64_t> promised() const;
// Returns the PID associated with this replica.
- process::PID<ReplicaProcess> pid();
+ process::PID<ReplicaProcess> pid() const;
private:
ReplicaProcess* process;
http://git-wip-us.apache.org/repos/asf/mesos/blob/19ad88b7/src/messages/log.proto
----------------------------------------------------------------------
diff --git a/src/messages/log.proto b/src/messages/log.proto
index e6460ab..8fa46ca 100644
--- a/src/messages/log.proto
+++ b/src/messages/log.proto
@@ -19,28 +19,28 @@
package mesos.internal.log;
-// Represents a "promise" that a replica has made to a coordinator. A
-// promise is *implicitly* valid for _all_ future actions that get
-// performed on the replicated log (provided the action comes from the
-// same coordinator), until a new promise is made to a coordinator
-// with a higher id. Each replica writesevery promise it makes as a
+// Represents a "promise" that a replica has made. A promise is
+// *implicitly* valid for _all_ future actions that get performed on
+// the replicated log (provided the action comes from the same
+// proposer), until a new promise is made to a proposer with a higher
+// proposal number. Each replica writes every promise it makes as a
// log record so that it can recover this information after a failure.
// TODO(benh): Does the promise actually need to be written to stable
// storage? Can we get away with looking at the last written action
// and using its promised value? In this case, what happens if we
// make a promise but don't receive an action from that coordinator?
message Promise {
- required uint64 id = 1;
+ required uint64 proposal = 1;
}
// Represents an "action" performed on the log. Each action has an
// associated position in the log. In addition, each action (i.e.,
-// position) will have been "promised" to a specific coordinator
+// position) will have been "promised" to a specific proposer
// (implicitly or explicitly) and may have been "performed" from a
-// specific coordinator. An action may also be "learned" to have
-// reached consensus. There are three types of possible actions that
-// can be performed on the log: nop (no action), append, and truncate.
+// specific proposer. An action may also be "learned" to have reached
+// consensus. There are three types of possible actions that can be
+// performed on the log: nop (no action), append, and truncate.
message Action {
required uint64 position = 1;
required uint64 promised = 2;
@@ -110,32 +110,35 @@ message Record {
////////////////////////////////////////////////////
-// Represents a "promise" request from a coordinator with the
-// specified id to a replica. Most such requests will occur after a
-// coordinator has failed and a new coordinator is elected. In such a
-// case the position that the coordinator is asking the replica to
-// promise is implicitly *all* positions that the replica has made no
-// promises (thus the position field is not be used). In other
-// instances, however, a coordinator might be explicitly trying to
-// request that a replica promise a specific position in the log (such
-// as when trying to fill holes discovered during a client read), and
-// then position will be present.
+// Represents a "promise" request from a proposer with the specified
+// 'proposal' to a replica. If the proposer is a coordinator, most
+// such requests will occur after a coordinator has failed and a new
+// coordinator is elected. In such a case, the position that the
+// coordinator is asking the replica to promise is implicitly *all*
+// positions that the replica has made no promises (thus the position
+// field is not used). In other instances, however, a proposer
+// might be explicitly trying to request that a replica promise a
+// specific position in the log (such as when trying to fill holes
+// discovered during a client read), and then the 'position' field
+// will be present.
message PromiseRequest {
- required uint64 id = 1;
+ required uint64 proposal = 1;
optional uint64 position = 2;
}
-// Represents a "promise" response from a replica back to a
-// coordinator with the specified id. A replica represents a 'nack'
-// (because it has promised a coordinator with a higher id) by setting
-// the okay field to false. The replica either sends back the highest
-// position it has recorded in the log (using the position field) or
-// the specific action (if any) it has at the position requested in
-// PromiseRequest.
+// Represents a "promise" response from a replica back to a proposer.
+// A replica represents a NACK (because it has promised a proposer
+// with a higher proposal number) by setting the okay field to false.
+// The 'proposal' is either the aforementioned higher proposal number
+// when the response is a NACK, or the corresponding request's
+// proposal number if it is an ACK. The replica either sends back the
+// highest position it has recorded in the log (using the 'position'
+// field) or the specific action (if any) it has at the position
+// requested in PromiseRequest (using the 'action' field).
message PromiseResponse {
required bool okay = 1;
- required uint64 id = 2;
+ required uint64 proposal = 2;
optional uint64 position = 4;
optional Action action = 3;
}
@@ -146,7 +149,7 @@ message PromiseResponse {
// fields that are not relevant to a write request (e.g., promised,
// performed) and rather than ignore them we exclude them for safety.
message WriteRequest {
- required uint64 id = 1;
+ required uint64 proposal = 1;
required uint64 position = 2;
optional bool learned = 3;
required Action.Type type = 4;
@@ -156,31 +159,21 @@ message WriteRequest {
}
-// Represents a write response corresponding to a write request. If
-// okay is not true then the coordinator has been demoted. Both id and
-// position should always correspond to the id and position set in the
-// request.
+// Represents a write response corresponding to a write request. A
+// replica represents a NACK (because it has promised a proposer with
+// a higher proposal number) by setting the okay field to false. If
+// the proposer is a coordinator, then it has been demoted. The
+// 'position' should always correspond to the position set in the
+// request. The 'proposal' is either the same proposal number set in
+// the request in the case of an ACK, or the higher proposal number
+// this position has been promised to in the case of a NACK.
message WriteResponse {
required bool okay = 1;
- required uint64 id = 2;
+ required uint64 proposal = 2;
required uint64 position = 3;
}
-// Represents a learn (i.e., read) request and response. Note that a
-// non-learned position will not be returned. TODO(benh): Allow
-// learning more than one position at a time.
-message LearnRequest {
- required uint64 position = 1;
-}
-
-
-message LearnResponse {
- required bool okay = 1;
- optional Action action = 2;
-}
-
-
// Represents a "learned" event, that is, when a particular action has
// been agreed upon (reached consensus).
message LearnedMessage {
http://git-wip-us.apache.org/repos/asf/mesos/blob/19ad88b7/src/tests/log_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/log_tests.cpp b/src/tests/log_tests.cpp
index ff5f86c..fb9bbd8 100644
--- a/src/tests/log_tests.cpp
+++ b/src/tests/log_tests.cpp
@@ -48,8 +48,13 @@ using namespace mesos::internal::log;
using process::Clock;
using process::Future;
using process::Timeout;
+using process::Shared;
using process::UPID;
+using std::list;
+using std::set;
+using std::string;
+
using testing::_;
using testing::Eq;
using testing::Return;
@@ -63,7 +68,7 @@ class ReplicaTest : public TemporaryDirectoryTest {};
TEST_F(ReplicaTest, Promise)
{
- const std::string path = os::getcwd() + "/.log";
+ const string path = os::getcwd() + "/.log";
Replica replica(path);
@@ -71,7 +76,7 @@ TEST_F(ReplicaTest, Promise)
PromiseResponse response;
Future<PromiseResponse> future;
- request.set_id(2);
+ request.set_proposal(2);
future = protocol::promise(replica.pid(), request);
@@ -79,12 +84,12 @@ TEST_F(ReplicaTest, Promise)
response = future.get();
EXPECT_TRUE(response.okay());
- EXPECT_EQ(2u, response.id());
+ EXPECT_EQ(2u, response.proposal());
EXPECT_TRUE(response.has_position());
EXPECT_EQ(0u, response.position());
EXPECT_FALSE(response.has_action());
- request.set_id(1);
+ request.set_proposal(1);
future = protocol::promise(replica.pid(), request);
@@ -92,11 +97,11 @@ TEST_F(ReplicaTest, Promise)
response = future.get();
EXPECT_FALSE(response.okay());
- EXPECT_EQ(1u, response.id());
+ EXPECT_EQ(2u, response.proposal()); // Highest proposal seen so far.
EXPECT_FALSE(response.has_position());
EXPECT_FALSE(response.has_action());
- request.set_id(3);
+ request.set_proposal(3);
future = protocol::promise(replica.pid(), request);
@@ -104,7 +109,7 @@ TEST_F(ReplicaTest, Promise)
response = future.get();
EXPECT_TRUE(response.okay());
- EXPECT_EQ(3u, response.id());
+ EXPECT_EQ(3u, response.proposal());
EXPECT_TRUE(response.has_position());
EXPECT_EQ(0u, response.position());
EXPECT_FALSE(response.has_action());
@@ -113,14 +118,14 @@ TEST_F(ReplicaTest, Promise)
TEST_F(ReplicaTest, Append)
{
- const std::string path = os::getcwd() + "/.log";
+ const string path = os::getcwd() + "/.log";
Replica replica(path);
- const uint64_t id = 1;
+ const uint64_t proposal = 1;
PromiseRequest request1;
- request1.set_id(id);
+ request1.set_proposal(proposal);
Future<PromiseResponse> future1 =
protocol::promise(replica.pid(), request1);
@@ -129,13 +134,13 @@ TEST_F(ReplicaTest, Append)
PromiseResponse response1 = future1.get();
EXPECT_TRUE(response1.okay());
- EXPECT_EQ(id, response1.id());
+ EXPECT_EQ(proposal, response1.proposal());
EXPECT_TRUE(response1.has_position());
EXPECT_EQ(0u, response1.position());
EXPECT_FALSE(response1.has_action());
WriteRequest request2;
- request2.set_id(id);
+ request2.set_proposal(proposal);
request2.set_position(1);
request2.set_type(Action::APPEND);
request2.mutable_append()->set_bytes("hello world");
@@ -147,10 +152,10 @@ TEST_F(ReplicaTest, Append)
WriteResponse response2 = future2.get();
EXPECT_TRUE(response2.okay());
- EXPECT_EQ(id, response2.id());
+ EXPECT_EQ(proposal, response2.proposal());
EXPECT_EQ(1u, response2.position());
- Future<std::list<Action> > actions = replica.read(1, 1);
+ Future<list<Action> > actions = replica.read(1, 1);
AWAIT_READY(actions);
ASSERT_EQ(1u, actions.get().size());
@@ -172,14 +177,14 @@ TEST_F(ReplicaTest, Append)
TEST_F(ReplicaTest, Recover)
{
- const std::string path = os::getcwd() + "/.log";
+ const string path = os::getcwd() + "/.log";
Replica replica1(path);
- const uint64_t id = 1;
+ const uint64_t proposal= 1;
PromiseRequest request1;
- request1.set_id(id);
+ request1.set_proposal(proposal);
Future<PromiseResponse> future1 =
protocol::promise(replica1.pid(), request1);
@@ -188,13 +193,13 @@ TEST_F(ReplicaTest, Recover)
PromiseResponse response1 = future1.get();
EXPECT_TRUE(response1.okay());
- EXPECT_EQ(id, response1.id());
+ EXPECT_EQ(proposal, response1.proposal());
EXPECT_TRUE(response1.has_position());
EXPECT_EQ(0u, response1.position());
EXPECT_FALSE(response1.has_action());
WriteRequest request2;
- request2.set_id(id);
+ request2.set_proposal(proposal);
request2.set_position(1);
request2.set_type(Action::APPEND);
request2.mutable_append()->set_bytes("hello world");
@@ -206,10 +211,10 @@ TEST_F(ReplicaTest, Recover)
WriteResponse response2 = future2.get();
EXPECT_TRUE(response2.okay());
- EXPECT_EQ(id, response2.id());
+ EXPECT_EQ(proposal, response2.proposal());
EXPECT_EQ(1u, response2.position());
- Future<std::list<Action> > actions1 = replica1.read(1, 1);
+ Future<list<Action> > actions1 = replica1.read(1, 1);
AWAIT_READY(actions1);
ASSERT_EQ(1u, actions1.get().size());
@@ -231,7 +236,7 @@ TEST_F(ReplicaTest, Recover)
Replica replica2(path);
- Future<std::list<Action> > actions2 = replica2.read(1, 1);
+ Future<list<Action> > actions2 = replica2.read(1, 1);
AWAIT_READY(actions2);
ASSERT_EQ(1u, actions2.get().size());
@@ -258,18 +263,19 @@ class CoordinatorTest : public TemporaryDirectoryTest {};
TEST_F(CoordinatorTest, Elect)
{
- const std::string path1 = os::getcwd() + "/.log1";
- const std::string path2 = os::getcwd() + "/.log2";
+ const string path1 = os::getcwd() + "/.log1";
+ const string path2 = os::getcwd() + "/.log2";
- Replica replica1(path1);
- Replica replica2(path2);
+ Shared<Replica> replica1(new Replica(path1));
+ Shared<Replica> replica2(new Replica(path2));
- Network network;
+ set<UPID> pids;
+ pids.insert(replica1->pid());
+ pids.insert(replica2->pid());
- network.add(replica1.pid());
- network.add(replica2.pid());
+ Shared<Network> network(new Network(pids));
- Coordinator coord(2, &replica1, &network);
+ Coordinator coord(2, replica1, network);
{
Result<uint64_t> result = coord.elect(Timeout::in(Seconds(10)));
@@ -278,7 +284,7 @@ TEST_F(CoordinatorTest, Elect)
}
{
- Future<std::list<Action> > actions = replica1.read(0, 0);
+ Future<list<Action> > actions = replica1->read(0, 0);
AWAIT_READY(actions);
ASSERT_EQ(1u, actions.get().size());
EXPECT_EQ(0u, actions.get().front().position());
@@ -290,18 +296,19 @@ TEST_F(CoordinatorTest, Elect)
TEST_F(CoordinatorTest, AppendRead)
{
- const std::string path1 = os::getcwd() + "/.log1";
- const std::string path2 = os::getcwd() + "/.log2";
+ const string path1 = os::getcwd() + "/.log1";
+ const string path2 = os::getcwd() + "/.log2";
- Replica replica1(path1);
- Replica replica2(path2);
+ Shared<Replica> replica1(new Replica(path1));
+ Shared<Replica> replica2(new Replica(path2));
- Network network;
+ set<UPID> pids;
+ pids.insert(replica1->pid());
+ pids.insert(replica2->pid());
- network.add(replica1.pid());
- network.add(replica2.pid());
+ Shared<Network> network(new Network(pids));
- Coordinator coord(2, &replica1, &network);
+ Coordinator coord(2, replica1, network);
{
Result<uint64_t> result = coord.elect(Timeout::in(Seconds(10)));
@@ -320,7 +327,7 @@ TEST_F(CoordinatorTest, AppendRead)
}
{
- Future<std::list<Action> > actions = replica1.read(position, position);
+ Future<list<Action> > actions = replica1->read(position, position);
AWAIT_READY(actions);
ASSERT_EQ(1u, actions.get().size());
EXPECT_EQ(position, actions.get().front().position());
@@ -333,18 +340,19 @@ TEST_F(CoordinatorTest, AppendRead)
TEST_F(CoordinatorTest, AppendReadError)
{
- const std::string path1 = os::getcwd() + "/.log1";
- const std::string path2 = os::getcwd() + "/.log2";
+ const string path1 = os::getcwd() + "/.log1";
+ const string path2 = os::getcwd() + "/.log2";
- Replica replica1(path1);
- Replica replica2(path2);
+ Shared<Replica> replica1(new Replica(path1));
+ Shared<Replica> replica2(new Replica(path2));
- Network network;
+ set<UPID> pids;
+ pids.insert(replica1->pid());
+ pids.insert(replica2->pid());
- network.add(replica1.pid());
- network.add(replica2.pid());
+ Shared<Network> network(new Network(pids));
- Coordinator coord(2, &replica1, &network);
+ Coordinator coord(2, replica1, network);
{
Result<uint64_t> result = coord.elect(Timeout::in(Seconds(10)));
@@ -364,7 +372,7 @@ TEST_F(CoordinatorTest, AppendReadError)
{
position += 1;
- Future<std::list<Action> > actions = replica1.read(position, position);
+ Future<list<Action> > actions = replica1->read(position, position);
AWAIT_FAILED(actions);
EXPECT_EQ("Bad read range (past end of log)", actions.failure());
}
@@ -373,15 +381,16 @@ TEST_F(CoordinatorTest, AppendReadError)
TEST_F(CoordinatorTest, ElectNoQuorum)
{
- const std::string path = os::getcwd() + "/.log";
+ const string path = os::getcwd() + "/.log";
- Replica replica(path);
+ Shared<Replica> replica(new Replica(path));
- Network network;
+ set<UPID> pids;
+ pids.insert(replica->pid());
- network.add(replica.pid());
+ Shared<Network> network(new Network(pids));
- Coordinator coord(2, &replica, &network);
+ Coordinator coord(2, replica, network);
Clock::pause();
@@ -401,18 +410,19 @@ TEST_F(CoordinatorTest, ElectNoQuorum)
TEST_F(CoordinatorTest, AppendNoQuorum)
{
- const std::string path1 = os::getcwd() + "/.log1";
- const std::string path2 = os::getcwd() + "/.log2";
+ const string path1 = os::getcwd() + "/.log1";
+ const string path2 = os::getcwd() + "/.log2";
- Replica replica1(path1);
- Replica replica2(path2);
+ Shared<Replica> replica1(new Replica(path1));
+ Shared<Replica> replica2(new Replica(path2));
- Network network;
+ set<UPID> pids;
+ pids.insert(replica1->pid());
+ pids.insert(replica2->pid());
- network.add(replica1.pid());
- network.add(replica2.pid());
+ Shared<Network> network(new Network(pids));
- Coordinator coord(2, &replica1, &network);
+ Coordinator coord(2, replica1, network);
{
Result<uint64_t> result = coord.elect(Timeout::in(Seconds(10)));
@@ -420,7 +430,9 @@ TEST_F(CoordinatorTest, AppendNoQuorum)
EXPECT_EQ(0u, result.get());
}
- network.remove(replica2.pid());
+ process::terminate(replica2->pid());
+ process::wait(replica2->pid());
+ replica2.reset();
Clock::pause();
@@ -440,18 +452,19 @@ TEST_F(CoordinatorTest, AppendNoQuorum)
TEST_F(CoordinatorTest, Failover)
{
- const std::string path1 = os::getcwd() + "/.log1";
- const std::string path2 = os::getcwd() + "/.log2";
+ const string path1 = os::getcwd() + "/.log1";
+ const string path2 = os::getcwd() + "/.log2";
- Replica replica1(path1);
- Replica replica2(path2);
+ Shared<Replica> replica1(new Replica(path1));
+ Shared<Replica> replica2(new Replica(path2));
- Network network1;
+ set<UPID> pids;
+ pids.insert(replica1->pid());
+ pids.insert(replica2->pid());
- network1.add(replica1.pid());
- network1.add(replica2.pid());
+ Shared<Network> network1(new Network(pids));
- Coordinator coord1(2, &replica1, &network1);
+ Coordinator coord1(2, replica1, network1);
{
Result<uint64_t> result = coord1.elect(Timeout::in(Seconds(10)));
@@ -469,12 +482,9 @@ TEST_F(CoordinatorTest, Failover)
EXPECT_EQ(1u, position);
}
- Network network2;
+ Shared<Network> network2(new Network(pids));
- network2.add(replica1.pid());
- network2.add(replica2.pid());
-
- Coordinator coord2(2, &replica2, &network2);
+ Coordinator coord2(2, replica2, network2);
{
Result<uint64_t> result = coord2.elect(Timeout::in(Seconds(10)));
@@ -483,7 +493,7 @@ TEST_F(CoordinatorTest, Failover)
}
{
- Future<std::list<Action> > actions = replica2.read(position, position);
+ Future<list<Action> > actions = replica2->read(position, position);
AWAIT_READY(actions);
ASSERT_EQ(1u, actions.get().size());
EXPECT_EQ(position, actions.get().front().position());
@@ -496,18 +506,19 @@ TEST_F(CoordinatorTest, Failover)
TEST_F(CoordinatorTest, Demoted)
{
- const std::string path1 = os::getcwd() + "/.log1";
- const std::string path2 = os::getcwd() + "/.log2";
+ const string path1 = os::getcwd() + "/.log1";
+ const string path2 = os::getcwd() + "/.log2";
- Replica replica1(path1);
- Replica replica2(path2);
+ Shared<Replica> replica1(new Replica(path1));
+ Shared<Replica> replica2(new Replica(path2));
- Network network1;
+ set<UPID> pids;
+ pids.insert(replica1->pid());
+ pids.insert(replica2->pid());
- network1.add(replica1.pid());
- network1.add(replica2.pid());
+ Shared<Network> network1(new Network(pids));
- Coordinator coord1(2, &replica1, &network1);
+ Coordinator coord1(2, replica1, network1);
{
Result<uint64_t> result = coord1.elect(Timeout::in(Seconds(10)));
@@ -525,12 +536,9 @@ TEST_F(CoordinatorTest, Demoted)
EXPECT_EQ(1u, position);
}
- Network network2;
+ Shared<Network> network2(new Network(pids));
- network2.add(replica1.pid());
- network2.add(replica2.pid());
-
- Coordinator coord2(2, &replica2, &network2);
+ Coordinator coord2(2, replica2, network2);
{
Result<uint64_t> result = coord2.elect(Timeout::in(Seconds(10)));
@@ -554,7 +562,7 @@ TEST_F(CoordinatorTest, Demoted)
}
{
- Future<std::list<Action> > actions = replica2.read(position, position);
+ Future<list<Action> > actions = replica2->read(position, position);
AWAIT_READY(actions);
ASSERT_EQ(1u, actions.get().size());
EXPECT_EQ(position, actions.get().front().position());
@@ -567,19 +575,20 @@ TEST_F(CoordinatorTest, Demoted)
TEST_F(CoordinatorTest, Fill)
{
- const std::string path1 = os::getcwd() + "/.log1";
- const std::string path2 = os::getcwd() + "/.log2";
- const std::string path3 = os::getcwd() + "/.log3";
+ const string path1 = os::getcwd() + "/.log1";
+ const string path2 = os::getcwd() + "/.log2";
+ const string path3 = os::getcwd() + "/.log3";
- Replica replica1(path1);
- Replica replica2(path2);
+ Shared<Replica> replica1(new Replica(path1));
+ Shared<Replica> replica2(new Replica(path2));
- Network network1;
+ set<UPID> pids;
+ pids.insert(replica1->pid());
+ pids.insert(replica2->pid());
- network1.add(replica1.pid());
- network1.add(replica2.pid());
+ Shared<Network> network1(new Network(pids));
- Coordinator coord1(2, &replica1, &network1);
+ Coordinator coord1(2, replica1, network1);
{
Result<uint64_t> result = coord1.elect(Timeout::in(Seconds(10)));
@@ -597,14 +606,15 @@ TEST_F(CoordinatorTest, Fill)
EXPECT_EQ(1u, position);
}
- Replica replica3(path3);
+ Shared<Replica> replica3(new Replica(path3));
- Network network2;
+ pids.clear();
+ pids.insert(replica2->pid());
+ pids.insert(replica3->pid());
- network2.add(replica2.pid());
- network2.add(replica3.pid());
+ Shared<Network> network2(new Network(pids));
- Coordinator coord2(2, &replica3, &network2);
+ Coordinator coord2(2, replica3, network2);
{
Result<uint64_t> result = coord2.elect(Timeout::in(Seconds(10)));
@@ -615,7 +625,7 @@ TEST_F(CoordinatorTest, Fill)
}
{
- Future<std::list<Action> > actions = replica3.read(position, position);
+ Future<list<Action> > actions = replica3->read(position, position);
AWAIT_READY(actions);
ASSERT_EQ(1u, actions.get().size());
EXPECT_EQ(position, actions.get().front().position());
@@ -628,21 +638,24 @@ TEST_F(CoordinatorTest, Fill)
TEST_F(CoordinatorTest, NotLearnedFill)
{
- DROP_MESSAGES(Eq(LearnedMessage().GetTypeName()), _, _);
+ const string path1 = os::getcwd() + "/.log1";
+ const string path2 = os::getcwd() + "/.log2";
+ const string path3 = os::getcwd() + "/.log3";
- const std::string path1 = os::getcwd() + "/.log1";
- const std::string path2 = os::getcwd() + "/.log2";
- const std::string path3 = os::getcwd() + "/.log3";
+ Shared<Replica> replica1(new Replica(path1));
+ Shared<Replica> replica2(new Replica(path2));
- Replica replica1(path1);
- Replica replica2(path2);
+ // Set up the message drop here (after creating the replicas) since it needs
+ // replica2's pid; we only want to drop learned messages sent to replica2.
+ DROP_MESSAGES(Eq(LearnedMessage().GetTypeName()), _, Eq(replica2->pid()));
- Network network1;
+ set<UPID> pids;
+ pids.insert(replica1->pid());
+ pids.insert(replica2->pid());
- network1.add(replica1.pid());
- network1.add(replica2.pid());
+ Shared<Network> network1(new Network(pids));
- Coordinator coord1(2, &replica1, &network1);
+ Coordinator coord1(2, replica1, network1);
{
Result<uint64_t> result = coord1.elect(Timeout::in(Seconds(10)));
@@ -660,14 +673,15 @@ TEST_F(CoordinatorTest, NotLearnedFill)
EXPECT_EQ(1u, position);
}
- Replica replica3(path3);
+ Shared<Replica> replica3(new Replica(path3));
- Network network2;
+ pids.clear();
+ pids.insert(replica2->pid());
+ pids.insert(replica3->pid());
- network2.add(replica2.pid());
- network2.add(replica3.pid());
+ Shared<Network> network2(new Network(pids));
- Coordinator coord2(2, &replica3, &network2);
+ Coordinator coord2(2, replica3, network2);
{
Result<uint64_t> result = coord2.elect(Timeout::in(Seconds(10)));
@@ -678,7 +692,7 @@ TEST_F(CoordinatorTest, NotLearnedFill)
}
{
- Future<std::list<Action> > actions = replica3.read(position, position);
+ Future<list<Action> > actions = replica3->read(position, position);
AWAIT_READY(actions);
ASSERT_EQ(1u, actions.get().size());
EXPECT_EQ(position, actions.get().front().position());
@@ -691,18 +705,19 @@ TEST_F(CoordinatorTest, NotLearnedFill)
TEST_F(CoordinatorTest, MultipleAppends)
{
- const std::string path1 = os::getcwd() + "/.log1";
- const std::string path2 = os::getcwd() + "/.log2";
+ const string path1 = os::getcwd() + "/.log1";
+ const string path2 = os::getcwd() + "/.log2";
- Replica replica1(path1);
- Replica replica2(path2);
+ Shared<Replica> replica1(new Replica(path1));
+ Shared<Replica> replica2(new Replica(path2));
- Network network;
+ set<UPID> pids;
+ pids.insert(replica1->pid());
+ pids.insert(replica2->pid());
- network.add(replica1.pid());
- network.add(replica2.pid());
+ Shared<Network> network(new Network(pids));
- Coordinator coord(2, &replica1, &network);
+ Coordinator coord(2, replica1, network);
{
Result<uint64_t> result = coord.elect(Timeout::in(Seconds(10)));
@@ -718,7 +733,7 @@ TEST_F(CoordinatorTest, MultipleAppends)
}
{
- Future<std::list<Action> > actions = replica1.read(1, 10);
+ Future<list<Action> > actions = replica1->read(1, 10);
AWAIT_READY(actions);
EXPECT_EQ(10u, actions.get().size());
foreach (const Action& action, actions.get()) {
@@ -732,21 +747,24 @@ TEST_F(CoordinatorTest, MultipleAppends)
TEST_F(CoordinatorTest, MultipleAppendsNotLearnedFill)
{
- DROP_MESSAGES(Eq(LearnedMessage().GetTypeName()), _, _);
+ const string path1 = os::getcwd() + "/.log1";
+ const string path2 = os::getcwd() + "/.log2";
+ const string path3 = os::getcwd() + "/.log3";
- const std::string path1 = os::getcwd() + "/.log1";
- const std::string path2 = os::getcwd() + "/.log2";
- const std::string path3 = os::getcwd() + "/.log3";
+ Shared<Replica> replica1(new Replica(path1));
+ Shared<Replica> replica2(new Replica(path2));
- Replica replica1(path1);
- Replica replica2(path2);
+ // Set up the message drop here (after creating the replicas) since it needs
+ // replica2's pid; we only want to drop learned messages sent to replica2.
+ DROP_MESSAGES(Eq(LearnedMessage().GetTypeName()), _, Eq(replica2->pid()));
- Network network1;
+ set<UPID> pids;
+ pids.insert(replica1->pid());
+ pids.insert(replica2->pid());
- network1.add(replica1.pid());
- network1.add(replica2.pid());
+ Shared<Network> network1(new Network(pids));
- Coordinator coord1(2, &replica1, &network1);
+ Coordinator coord1(2, replica1, network1);
{
Result<uint64_t> result = coord1.elect(Timeout::in(Seconds(10)));
@@ -761,14 +779,15 @@ TEST_F(CoordinatorTest, MultipleAppendsNotLearnedFill)
EXPECT_EQ(position, result.get());
}
- Replica replica3(path3);
+ Shared<Replica> replica3(new Replica(path3));
- Network network2;
+ pids.clear();
+ pids.insert(replica2->pid());
+ pids.insert(replica3->pid());
- network2.add(replica2.pid());
- network2.add(replica3.pid());
+ Shared<Network> network2(new Network(pids));
- Coordinator coord2(2, &replica3, &network2);
+ Coordinator coord2(2, replica3, network2);
{
Result<uint64_t> result = coord2.elect(Timeout::in(Seconds(10)));
@@ -779,7 +798,7 @@ TEST_F(CoordinatorTest, MultipleAppendsNotLearnedFill)
}
{
- Future<std::list<Action> > actions = replica3.read(1, 10);
+ Future<list<Action> > actions = replica3->read(1, 10);
AWAIT_READY(actions);
EXPECT_EQ(10u, actions.get().size());
foreach (const Action& action, actions.get()) {
@@ -793,18 +812,19 @@ TEST_F(CoordinatorTest, MultipleAppendsNotLearnedFill)
TEST_F(CoordinatorTest, Truncate)
{
- const std::string path1 = os::getcwd() + "/.log1";
- const std::string path2 = os::getcwd() + "/.log2";
+ const string path1 = os::getcwd() + "/.log1";
+ const string path2 = os::getcwd() + "/.log2";
- Replica replica1(path1);
- Replica replica2(path2);
+ Shared<Replica> replica1(new Replica(path1));
+ Shared<Replica> replica2(new Replica(path2));
- Network network;
+ set<UPID> pids;
+ pids.insert(replica1->pid());
+ pids.insert(replica2->pid());
- network.add(replica1.pid());
- network.add(replica2.pid());
+ Shared<Network> network(new Network(pids));
- Coordinator coord(2, &replica1, &network);
+ Coordinator coord(2, replica1, network);
{
Result<uint64_t> result = coord.elect(Timeout::in(Seconds(10)));
@@ -826,13 +846,13 @@ TEST_F(CoordinatorTest, Truncate)
}
{
- Future<std::list<Action> > actions = replica1.read(6, 10);
+ Future<list<Action> > actions = replica1->read(6, 10);
AWAIT_FAILED(actions);
EXPECT_EQ("Bad read range (truncated position)", actions.failure());
}
{
- Future<std::list<Action> > actions = replica1.read(7, 10);
+ Future<list<Action> > actions = replica1->read(7, 10);
AWAIT_READY(actions);
EXPECT_EQ(4u, actions.get().size());
foreach (const Action& action, actions.get()) {
@@ -846,21 +866,24 @@ TEST_F(CoordinatorTest, Truncate)
TEST_F(CoordinatorTest, TruncateNotLearnedFill)
{
- DROP_MESSAGES(Eq(LearnedMessage().GetTypeName()), _, _);
+ const string path1 = os::getcwd() + "/.log1";
+ const string path2 = os::getcwd() + "/.log2";
+ const string path3 = os::getcwd() + "/.log3";
- const std::string path1 = os::getcwd() + "/.log1";
- const std::string path2 = os::getcwd() + "/.log2";
- const std::string path3 = os::getcwd() + "/.log3";
+ Shared<Replica> replica1(new Replica(path1));
+ Shared<Replica> replica2(new Replica(path2));
- Replica replica1(path1);
- Replica replica2(path2);
+ // Set up the message drop here (after creating the replicas) since it needs
+ // replica2's pid; we only want to drop learned messages sent to replica2.
+ DROP_MESSAGES(Eq(LearnedMessage().GetTypeName()), _, Eq(replica2->pid()));
- Network network1;
+ set<UPID> pids;
+ pids.insert(replica1->pid());
+ pids.insert(replica2->pid());
- network1.add(replica1.pid());
- network1.add(replica2.pid());
+ Shared<Network> network1(new Network(pids));
- Coordinator coord1(2, &replica1, &network1);
+ Coordinator coord1(2, replica1, network1);
{
Result<uint64_t> result = coord1.elect(Timeout::in(Seconds(10)));
@@ -881,14 +904,15 @@ TEST_F(CoordinatorTest, TruncateNotLearnedFill)
EXPECT_EQ(11u, result.get());
}
- Replica replica3(path3);
+ Shared<Replica> replica3(new Replica(path3));
- Network network2;
+ pids.clear();
+ pids.insert(replica2->pid());
+ pids.insert(replica3->pid());
- network2.add(replica2.pid());
- network2.add(replica3.pid());
+ Shared<Network> network2(new Network(pids));
- Coordinator coord2(2, &replica3, &network2);
+ Coordinator coord2(2, replica3, network2);
{
Result<uint64_t> result = coord2.elect(Timeout::in(Seconds(10)));
@@ -899,13 +923,13 @@ TEST_F(CoordinatorTest, TruncateNotLearnedFill)
}
{
- Future<std::list<Action> > actions = replica3.read(6, 10);
+ Future<list<Action> > actions = replica3->read(6, 10);
AWAIT_FAILED(actions);
EXPECT_EQ("Bad read range (truncated position)", actions.failure());
}
{
- Future<std::list<Action> > actions = replica3.read(7, 10);
+ Future<list<Action> > actions = replica3->read(7, 10);
AWAIT_READY(actions);
EXPECT_EQ(4u, actions.get().size());
foreach (const Action& action, actions.get()) {
@@ -919,19 +943,20 @@ TEST_F(CoordinatorTest, TruncateNotLearnedFill)
TEST_F(CoordinatorTest, TruncateLearnedFill)
{
- const std::string path1 = os::getcwd() + "/.log1";
- const std::string path2 = os::getcwd() + "/.log2";
- const std::string path3 = os::getcwd() + "/.log3";
+ const string path1 = os::getcwd() + "/.log1";
+ const string path2 = os::getcwd() + "/.log2";
+ const string path3 = os::getcwd() + "/.log3";
- Replica replica1(path1);
- Replica replica2(path2);
+ Shared<Replica> replica1(new Replica(path1));
+ Shared<Replica> replica2(new Replica(path2));
- Network network1;
+ set<UPID> pids;
+ pids.insert(replica1->pid());
+ pids.insert(replica2->pid());
- network1.add(replica1.pid());
- network1.add(replica2.pid());
+ Shared<Network> network1(new Network(pids));
- Coordinator coord1(2, &replica1, &network1);
+ Coordinator coord1(2, replica1, network1);
{
Result<uint64_t> result = coord1.elect(Timeout::in(Seconds(10)));
@@ -952,14 +977,15 @@ TEST_F(CoordinatorTest, TruncateLearnedFill)
EXPECT_EQ(11u, result.get());
}
- Replica replica3(path3);
+ Shared<Replica> replica3(new Replica(path3));
- Network network2;
+ pids.clear();
+ pids.insert(replica2->pid());
+ pids.insert(replica3->pid());
- network2.add(replica2.pid());
- network2.add(replica3.pid());
+ Shared<Network> network2(new Network(pids));
- Coordinator coord2(2, &replica3, &network2);
+ Coordinator coord2(2, replica3, network2);
{
Result<uint64_t> result = coord2.elect(Timeout::in(Seconds(10)));
@@ -970,13 +996,13 @@ TEST_F(CoordinatorTest, TruncateLearnedFill)
}
{
- Future<std::list<Action> > actions = replica3.read(6, 10);
+ Future<list<Action> > actions = replica3->read(6, 10);
AWAIT_FAILED(actions);
EXPECT_EQ("Bad read range (truncated position)", actions.failure());
}
{
- Future<std::list<Action> > actions = replica3.read(7, 10);
+ Future<list<Action> > actions = replica3->read(7, 10);
AWAIT_READY(actions);
EXPECT_EQ(4u, actions.get().size());
foreach (const Action& action, actions.get()) {
@@ -993,12 +1019,12 @@ class LogTest : public TemporaryDirectoryTest {};
TEST_F(LogTest, WriteRead)
{
- const std::string path1 = os::getcwd() + "/.log1";
- const std::string path2 = os::getcwd() + "/.log2";
+ const string path1 = os::getcwd() + "/.log1";
+ const string path2 = os::getcwd() + "/.log2";
Replica replica1(path1);
- std::set<UPID> pids;
+ set<UPID> pids;
pids.insert(replica1.pid());
Log log(2, path2, pids);
@@ -1012,7 +1038,7 @@ TEST_F(LogTest, WriteRead)
Log::Reader reader(&log);
- Result<std::list<Log::Entry> > entries =
+ Result<list<Log::Entry> > entries =
reader.read(position.get(), position.get(), Timeout::in(Seconds(10)));
ASSERT_SOME(entries);
@@ -1024,12 +1050,12 @@ TEST_F(LogTest, WriteRead)
TEST_F(LogTest, Position)
{
- const std::string path1 = os::getcwd() + "/.log1";
- const std::string path2 = os::getcwd() + "/.log2";
+ const string path1 = os::getcwd() + "/.log1";
+ const string path2 = os::getcwd() + "/.log2";
Replica replica1(path1);
- std::set<UPID> pids;
+ set<UPID> pids;
pids.insert(replica1.pid());
Log log(2, path2, pids);
[05/10] git commit: Added log recovery support.
Posted by be...@apache.org.
Added log recovery support.
From: Jie Yu <yu...@gmail.com>
Review: https://reviews.apache.org/r/15802
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f9b60c4c
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f9b60c4c
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f9b60c4c
Branch: refs/heads/master
Commit: f9b60c4cac503b2d7a1c0cb5403203e619e563e6
Parents: 6ea7c14
Author: Benjamin Hindman <be...@gmail.com>
Authored: Thu Jan 16 16:53:40 2014 -0800
Committer: Benjamin Hindman <be...@gmail.com>
Committed: Thu Jan 16 16:53:40 2014 -0800
----------------------------------------------------------------------
src/Makefile.am | 3 +
src/common/type_utils.hpp | 25 +
src/log/coordinator.cpp | 121 +---
src/log/coordinator.hpp | 42 +-
src/log/log.cpp | 1192 ++++++++++++++++++++++++----------------
src/log/log.hpp | 283 ++--------
src/log/recover.cpp | 403 ++++++++++++++
src/log/recover.hpp | 59 ++
src/log/replica.cpp | 214 ++++++--
src/log/replica.hpp | 13 +-
src/messages/log.proto | 18 +-
src/tests/log_tests.cpp | 419 ++++++++------
12 files changed, 1727 insertions(+), 1065 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/f9b60c4c/src/Makefile.am
----------------------------------------------------------------------
diff --git a/src/Makefile.am b/src/Makefile.am
index 17fbf83..60fcb31 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -301,11 +301,14 @@ liblog_la_SOURCES = \
log/catchup.cpp \
log/consensus.cpp \
log/coordinator.cpp \
+ log/log.cpp \
+ log/recover.cpp \
log/replica.cpp
liblog_la_SOURCES += \
log/catchup.hpp \
log/consensus.hpp \
log/coordinator.hpp \
+ log/recover.hpp \
log/replica.hpp \
log/log.hpp \
log/network.hpp \
http://git-wip-us.apache.org/repos/asf/mesos/blob/f9b60c4c/src/common/type_utils.hpp
----------------------------------------------------------------------
diff --git a/src/common/type_utils.hpp b/src/common/type_utils.hpp
index 3b05751..fe6bf71 100644
--- a/src/common/type_utils.hpp
+++ b/src/common/type_utils.hpp
@@ -30,6 +30,7 @@
#include "common/attributes.hpp"
+#include "messages/log.hpp"
#include "messages/messages.hpp"
// This file includes definitions for operators on protobuf classes
@@ -371,4 +372,28 @@ inline std::ostream& operator << (
}} // namespace mesos { namespace internal {
+
+namespace mesos {
+namespace internal {
+namespace log {
+
+inline std::ostream& operator << (
+ std::ostream& stream,
+ const Action::Type& type)
+{
+ return stream << Action::Type_Name(type);
+}
+
+
+inline std::ostream& operator << (
+ std::ostream& stream,
+ const Metadata::Status& status)
+{
+ return stream << Metadata::Status_Name(status);
+}
+
+} // namespace log {
+} // namespace internal {
+} // namespace mesos {
+
#endif // __TYPE_UTILS_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/f9b60c4c/src/log/coordinator.cpp
----------------------------------------------------------------------
diff --git a/src/log/coordinator.cpp b/src/log/coordinator.cpp
index 21f2865..bc85e66 100644
--- a/src/log/coordinator.cpp
+++ b/src/log/coordinator.cpp
@@ -20,13 +20,13 @@
#include <process/defer.hpp>
#include <process/dispatch.hpp>
-#include <process/future.hpp>
#include <process/id.hpp>
#include <process/process.hpp>
-#include <stout/error.hpp>
#include <stout/none.hpp>
+#include "common/type_utils.hpp"
+
#include "log/catchup.hpp"
#include "log/consensus.hpp"
#include "log/coordinator.hpp"
@@ -59,22 +59,10 @@ public:
virtual ~CoordinatorProcess() {}
- // Handles coordinator election. Returns the last committed log
- // position if the operation succeeds. Returns none if the election
- // is not successful, but can be retried.
+ // See comments in 'coordinator.hpp'.
Future<Option<uint64_t> > elect();
-
- // Handles coordinator demotion. Returns the last committed log
- // position if the operation succeeds.
Future<uint64_t> demote();
-
- // Appends the specified bytes to the end of the log. Returns the
- // position of the appended entry if the operation succeeds.
Future<uint64_t> append(const string& bytes);
-
- // Removes all log entries preceding the log entry at the given
- // position (to). Returns the position at which the truncate
- // operation is written if the operation succeeds.
Future<uint64_t> truncate(uint64_t to);
protected:
@@ -344,8 +332,7 @@ Future<uint64_t> CoordinatorProcess::truncate(uint64_t to)
Future<uint64_t> CoordinatorProcess::write(const Action& action)
{
- LOG(INFO) << "Coordinator attempting to write "
- << Action::Type_Name(action.type())
+ LOG(INFO) << "Coordinator attempting to write " << action.type()
<< " action at position " << action.position();
CHECK_EQ(state, ELECTED);
@@ -455,111 +442,27 @@ Coordinator::~Coordinator()
}
-Result<uint64_t> Coordinator::elect(const Timeout& timeout)
+Future<Option<uint64_t> > Coordinator::elect()
{
- LOG(INFO) << "Coordinator attempting to get elected within "
- << timeout.remaining();
-
- Future<Option<uint64_t> > electing =
- dispatch(process, &CoordinatorProcess::elect);
-
- electing.await(timeout.remaining());
-
- CHECK(!electing.isDiscarded());
-
- if (electing.isPending()) {
- LOG(INFO) << "Coordinator timed out while trying to get elected";
-
- electing.discard();
- return None();
- } else if (electing.isFailed()) {
- LOG(ERROR) << "Coordinator failed to get elected: "
- << electing.failure();
-
- return Error(electing.failure());
- } else {
- if (electing.get().isNone()) {
- LOG(INFO) << "Coordinator lost an election, but can be retried";
-
- return None();
- } else {
- LOG(INFO) << "Coordinator elected with current position "
- << electing.get().get();
-
- return electing.get().get();
- }
- }
+ return dispatch(process, &CoordinatorProcess::elect);
}
-Result<uint64_t> Coordinator::demote()
+Future<uint64_t> Coordinator::demote()
{
- Future<uint64_t> demoting =
- dispatch(process, &CoordinatorProcess::demote);
-
- demoting.await(); // TODO(jieyu): Use a timeout.
-
- CHECK(!demoting.isDiscarded());
-
- if (demoting.isFailed()) {
- return Error(demoting.failure());
- } else {
- return demoting.get();
- }
+ return dispatch(process, &CoordinatorProcess::demote);
}
-Result<uint64_t> Coordinator::append(
- const string& bytes,
- const Timeout& timeout)
+Future<uint64_t> Coordinator::append(const string& bytes)
{
- Future<uint64_t> appending =
- dispatch(process, &CoordinatorProcess::append, bytes);
-
- appending.await(timeout.remaining());
-
- CHECK(!appending.isDiscarded());
-
- if (appending.isPending()) {
- LOG(INFO) << "Coordinator timed out while trying to append";
-
- appending.discard();
- return None();
- } else if (appending.isFailed()) {
- LOG(ERROR) << "Coordinator failed to append the log: "
- << appending.failure();
-
- return Error(appending.failure());
- } else {
- return appending.get();
- }
+ return dispatch(process, &CoordinatorProcess::append, bytes);
}
-Result<uint64_t> Coordinator::truncate(
- uint64_t to,
- const Timeout& timeout)
+Future<uint64_t> Coordinator::truncate(uint64_t to)
{
- Future<uint64_t> truncating =
- dispatch(process, &CoordinatorProcess::truncate, to);
-
- truncating.await(timeout.remaining());
-
- CHECK(!truncating.isDiscarded());
-
- if (truncating.isPending()) {
- LOG(INFO) << "Coordinator timed out while trying to truncate";
-
- truncating.discard();
- return None();
- } else if (truncating.isFailed()) {
- LOG(ERROR) << "Coordinator failed to truncate the log: "
- << truncating.failure();
-
- return Error(truncating.failure());
- } else {
- return truncating.get();
- }
+ return dispatch(process, &CoordinatorProcess::truncate, to);
}
} // namespace log {
http://git-wip-us.apache.org/repos/asf/mesos/blob/f9b60c4c/src/log/coordinator.hpp
----------------------------------------------------------------------
diff --git a/src/log/coordinator.hpp b/src/log/coordinator.hpp
index 43cb530..35b68e9 100644
--- a/src/log/coordinator.hpp
+++ b/src/log/coordinator.hpp
@@ -23,10 +23,10 @@
#include <string>
+#include <process/future.hpp>
#include <process/shared.hpp>
-#include <process/timeout.hpp>
-#include <stout/result.hpp>
+#include <stout/option.hpp>
#include "log/network.hpp"
#include "log/replica.hpp"
@@ -49,25 +49,25 @@ public:
~Coordinator();
- // Handles coordinator election/demotion. A result of none means the
- // coordinator failed to achieve a quorum (e.g., due to timeout) but
- // can be retried. A some result returns the last committed log
- // position.
- Result<uint64_t> elect(const process::Timeout& timeout);
- Result<uint64_t> demote();
-
- // Returns the result of trying to append the specified bytes. A
- // result of none means the append failed (e.g., due to timeout),
- // but can be retried.
- Result<uint64_t> append(
- const std::string& bytes,
- const process::Timeout& timeout);
-
- // Returns the result of trying to truncate the log (from the
- // beginning to the specified position exclusive). A result of
- // none means the truncate failed (e.g., due to timeout), but can be
- // retried.
- Result<uint64_t> truncate(uint64_t to, const process::Timeout& timeout);
+ // Handles coordinator election. Returns the last committed (a.k.a.,
+ // learned) log position if the operation succeeds. Returns none if
+ // the election is not successful, but can be retried.
+ process::Future<Option<uint64_t> > elect();
+
+ // Handles coordinator demotion. Returns the last committed (a.k.a.,
+ // learned) log position if the operation succeeds. One should only
+ // call this function if the coordinator has been elected, and no
+ // write (append or truncate) is in progress.
+ process::Future<uint64_t> demote();
+
+ // Appends the specified bytes to the end of the log. Returns the
+ // position of the appended entry if the operation succeeds.
+ process::Future<uint64_t> append(const std::string& bytes);
+
+ // Removes all log entries preceding the log entry at the given
+ // position (to). Returns the position at which the truncate
+ // operation is written if the operation succeeds.
+ process::Future<uint64_t> truncate(uint64_t to);
private:
CoordinatorProcess* process;
http://git-wip-us.apache.org/repos/asf/mesos/blob/f9b60c4c/src/log/log.cpp
----------------------------------------------------------------------
diff --git a/src/log/log.cpp b/src/log/log.cpp
index d057925..e83f822 100644
--- a/src/log/log.cpp
+++ b/src/log/log.cpp
@@ -16,639 +16,893 @@
* limitations under the License.
*/
-// TODO(benh): Optimize LearnedMessage (and the "commit" stage in
-// general) by figuring out a way to not send the entire action
-// contents a second time (should cut bandwidth used in half).
-
-// TODO(benh): Provide a LearnRequest that requests more than one
-// position at a time, and a LearnResponse that returns as many
-// positions as it knows.
-
-// TODO(benh): Implement background catchup: have a new replica that
-// comes online become part of the group but don't respond to promises
-// or writes until it has caught up! The advantage to becoming part of
-// the group is that the new replica can see where the end of the log
-// is in order to continue to catch up.
-
-// TODO(benh): Add tests that deliberatly put the system in a state of
-// inconsistency by doing funky things to the underlying logs. Figure
-// out ways of bringing new replicas online that seem to check the
-// consistency of the other replicas.
-
-#include <list>
-#include <map>
-#include <set>
-#include <string>
-#include <vector>
-
-#include <boost/lexical_cast.hpp>
-
+#include <process/defer.hpp>
#include <process/dispatch.hpp>
+#include <process/id.hpp>
+#include <process/owned.hpp>
#include <process/process.hpp>
-#include <process/run.hpp>
+#include <process/shared.hpp>
#include <stout/check.hpp>
-#include <stout/duration.hpp>
-#include <stout/fatal.hpp>
+#include <stout/error.hpp>
#include <stout/foreach.hpp>
-#include <stout/os.hpp>
-#include <stout/result.hpp>
-
-#include "zookeeper/zookeeper.hpp"
+#include <stout/lambda.hpp>
+#include <stout/nothing.hpp>
+#include <stout/set.hpp>
#include "log/coordinator.hpp"
+#include "log/log.hpp"
+#include "log/network.hpp"
+#include "log/recover.hpp"
#include "log/replica.hpp"
-using namespace mesos;
-using namespace mesos::internal;
-using namespace mesos::internal::log;
-
using namespace process;
using std::list;
-using std::map;
-using std::pair;
using std::set;
using std::string;
-using std::vector;
+namespace mesos {
+namespace internal {
+namespace log {
-// class Drop : public Filter
-// {
-// public:
-// Drop
-// virtual bool filter(Message* message)
-// {
-// return == message->name;
-// }
-// };
+class LogProcess : public Process<LogProcess>
+{
+public:
+ LogProcess(
+ size_t _quorum,
+ const string& path,
+ const set<UPID>& pids);
+
+ LogProcess(
+ size_t _quorum,
+ const string& path,
+ const string& servers,
+ const Duration& timeout,
+ const string& znode,
+ const Option<zookeeper::Authentication>& auth);
+
+ // Recovers the log by catching up if needed. Returns a shared
+ // pointer to the local replica if the recovery succeeds.
+ Future<Shared<Replica> > recover();
+protected:
+ virtual void initialize();
+ virtual void finalize();
-// class PeriodicFilter
+private:
+ friend class LogReaderProcess;
+ friend class LogWriterProcess;
+ // Continuations.
+ void _recover();
-char** args; // Command line arguments for doing a restart.
+ // TODO(benh): Factor this out into "membership renewer".
+ void watch(
+ const UPID& pid,
+ const set<zookeeper::Group::Membership>& memberships);
+ void failed(const string& message);
+ void discarded();
-void restart()
-{
- LOG(INFO) << "Restarting ...";
- execv(args[0], args);
- fatalerror("Failed to exec");
-}
+ const size_t quorum;
+ Shared<Replica> replica;
+ Shared<Network> network;
+ // For replica recovery.
+ Option<Future<Owned<Replica> > > recovering;
+ process::Promise<Nothing> recovered;
+ list<process::Promise<Shared<Replica> >*> promises;
-bool coordinate(Coordinator* coordinator,
- uint64_t id,
- int end,
- map<int, int> truncations)
+ // For renewing membership. We store a Group instance in order to
+ // continually renew the replica's membership (when using ZooKeeper).
+ zookeeper::Group* group;
+ Future<zookeeper::Group::Membership> membership;
+};
+
+
+class LogReaderProcess : public Process<LogReaderProcess>
{
- const int attempts = 3;
+public:
+ LogReaderProcess(Log* log);
- uint64_t index;
+ Future<Log::Position> beginning();
+ Future<Log::Position> ending();
- int attempt = 1;
- while (true) {
- Result<uint64_t> result = coordinator->elect(id);
- if (result.isError()) {
- restart();
- } else if (result.isNone()) {
- if (attempt == attempts) {
- restart();
- } else {
- attempt++;
- os::sleep(Seconds(1));
- }
- } else {
- CHECK_SOME(result);
- index = result.get();
- break;
- }
- }
+ Future<list<Log::Entry> > read(
+ const Log::Position& from,
+ const Log::Position& to);
- uint64_t value = 0;
-
- if (index != 0) {
- attempt = 1;
- while (true) {
- Result<list<pair<uint64_t, string> > > result =
- coordinator->read(index, index);
- if (result.isError()) {
- LOG(INFO) << "Restarting due to read error";
- restart();
- } else if (result.isNone()) {
- if (attempt == attempts) {
- LOG(INFO) << "Restarting after too many attempts";
- restart();
- } else {
- attempt++;
- os::sleep(Seconds(1));
- }
- } else {
- CHECK_SOME(result);
- const list<pair<uint64_t, string> >& list = result.get();
- if (list.size() != 1) {
- index--;
- } else {
- try {
- value = boost::lexical_cast<uint64_t>(list.front().second);
- } catch (boost::bad_lexical_cast&) {
- LOG(INFO) << "Restarting due to conversion error";
- restart();
- }
- break;
- }
- }
- }
- }
+protected:
+ virtual void initialize();
+ virtual void finalize();
- value++;
-
- srand(time(NULL));
-
- int writes = rand() % 500;
-
- LOG(INFO) << "Attempting to do " << writes << " writes";
-
- attempt = 1;
- while (writes > 0 && value <= end) {
- if (truncations.count(value) > 0) {
- int to = truncations[value];
- Result<uint64_t> result = coordinator->truncate(to);
- if (result.isError()) {
- LOG(INFO) << "Restarting due to truncate error";
- restart();
- } else if (result.isNone()) {
- if (attempt == attempts) {
- LOG(INFO) << "Restarting after too many attempts";
- restart();
- } else {
- attempt++;
- os::sleep(Seconds(1));
- continue;
- }
- } else {
- CHECK_SOME(result);
- LOG(INFO) << "Truncated to " << to;
- os::sleep(Seconds(1));
- attempt = 1;
- }
- }
+private:
+ // Returns a position from a raw value.
+ static Log::Position position(uint64_t value);
- Result<uint64_t> result = coordinator->append(stringify(value));
- if (result.isError()) {
- LOG(INFO) << "Restarting due to append error";
- restart();
- } else if (result.isNone()) {
- if (attempt == attempts) {
- LOG(INFO) << "Restarting after too many attempts";
- restart();
- } else {
- attempt++;
- os::sleep(Seconds(1));
- }
- } else {
- CHECK_SOME(result);
- LOG(INFO) << "Wrote " << value;
- os::sleep(Seconds(1));
- writes--;
- value++;
- attempt = 1;
- }
- }
+ // Returns a future which gets set when the log recovery has
+ // finished (either succeeded or failed).
+ Future<Nothing> recover();
- exit(0);
- return true;
-}
+ // Continuations.
+ void _recover();
+ Future<Log::Position> _beginning();
+ Future<Log::Position> _ending();
-class LogProcess : public Process<LogProcess>
+ Future<list<Log::Entry> > _read(
+ const Log::Position& from,
+ const Log::Position& to);
+
+ Future<list<Log::Entry> > __read(
+ const Log::Position& from,
+ const Log::Position& to,
+ const list<Action>& actions);
+
+ Future<Shared<Replica> > recovering;
+ list<process::Promise<Nothing>*> promises;
+};
+
+
+class LogWriterProcess : public Process<LogWriterProcess>
{
public:
- LogProcess(int _quorum,
- const string& _file,
- const string& _servers,
- const string& _znode,
- int _end,
- const map<int, int>& _truncations);
-
- virtual ~LogProcess();
-
- // ZooKeeper events. TODO(*): Use a ZooKeeper listener?
- void connected();
- void reconnecting();
- void reconnected();
- void expired();
- void updated(const string& path);
+ LogWriterProcess(Log* log);
+
+ Future<Option<Log::Position> > elect();
+ Future<Log::Position> append(const string& bytes);
+ Future<Log::Position> truncate(const Log::Position& to);
protected:
- virtual void initialze();
+ virtual void initialize();
+ virtual void finalize();
private:
- // Updates the group.
- void regroup();
+ // Returns a position from a raw value.
+ static Log::Position position(uint64_t value);
- // Runs an election.
- void elect();
+ // Returns a future which gets set when the log recovery has
+ // finished (either succeeded or failed).
+ Future<Nothing> recover();
- // ZooKeeper bits and pieces.
- string servers;
- string znode;
- ZooKeeper* zk;
- Watcher* watcher;
+ // Continuations.
+ void _recover();
- // Size of quorum.
- int quorum;
+ Future<Option<Log::Position> > _elect();
+ Option<Log::Position> __elect(const Option<uint64_t>& result);
- // Log file.
- string file;
+ void failed(const string& message);
- // Termination value (when to stop writing to the log).
- int end;
+ const size_t quorum;
+ const Shared<Network> network;
- // Truncation points.
- map<int, int> truncations;
+ Future<Shared<Replica> > recovering;
+ list<process::Promise<Nothing>*> promises;
- // Coordinator id.
- uint64_t id;
+ Coordinator* coordinator;
+ Option<string> error;
+};
- // Whether or not the coordinator has been elected.
- bool elected;
- // Group members.
- set<UPID> members;
+/////////////////////////////////////////////////
+// Implementation of LogProcess.
+/////////////////////////////////////////////////
- ReplicaProcess* replica;
- GroupProcess* group;
- Coordinator* coordinator;
-};
+
+LogProcess::LogProcess(
+ size_t _quorum,
+ const string& path,
+ const set<UPID>& pids)
+ : ProcessBase(ID::generate("log")),
+ quorum(_quorum),
+ replica(new Replica(path)),
+ network(new Network(pids + (UPID) replica->pid())),
+ group(NULL) {}
+
+
+LogProcess::LogProcess(
+ size_t _quorum,
+ const string& path,
+ const string& servers,
+ const Duration& timeout,
+ const string& znode,
+ const Option<zookeeper::Authentication>& auth)
+ : ProcessBase(ID::generate("log")),
+ quorum(_quorum),
+ replica(new Replica(path)),
+ network(new ZooKeeperNetwork(servers, timeout, znode, auth)),
+ group(new zookeeper::Group(servers, timeout, znode, auth)) {}
-class LogProcessWatcher : public Watcher
+void LogProcess::initialize()
{
-public:
- LogProcessWatcher(const PID<LogProcess>& _pid)
- : pid(_pid), reconnect(false) {}
-
- virtual ~LogProcessWatcher() {}
-
- virtual void process(ZooKeeper* zk, int type, int state, const string& path)
- {
- if ((state == ZOO_CONNECTED_STATE) && (type == ZOO_SESSION_EVENT)) {
- // Check if this is a reconnect.
- if (!reconnect) {
- // Initial connect.
- dispatch(pid, &LogProcess::connected);
- } else {
- // Reconnected.
- dispatch(pid, &LogProcess::reconnected);
- }
- } else if ((state == ZOO_CONNECTING_STATE) &&
- (type == ZOO_SESSION_EVENT)) {
- // The client library automatically reconnects, taking into
- // account failed servers in the connection string,
- // appropriately handling the "herd effect", etc.
- reconnect = true;
- dispatch(pid, &LogProcess::reconnecting);
- } else if ((state == ZOO_EXPIRED_SESSION_STATE) &&
- (type == ZOO_SESSION_EVENT)) {
- dispatch(pid, &LogProcess::expired);
-
- // If this watcher is reused, the next connect won't be a reconnect.
- reconnect = false;
- } else if ((state == ZOO_CONNECTED_STATE) && (type == ZOO_CHILD_EVENT)) {
- dispatch(pid, &LogProcess::updated, path);
- } else if ((state == ZOO_CONNECTED_STATE) && (type == ZOO_CHANGED_EVENT)) {
- dispatch(pid, &LogProcess::updated, path);
- } else {
- LOG(FATAL) << "Unimplemented ZooKeeper event: (state is "
- << state << " and type is " << type << ")";
- }
+ if (group != NULL) {
+ // Need to add our replica to the ZooKeeper group!
+ LOG(INFO) << "Attempting to join replica to ZooKeeper group";
+
+ membership = group->join(replica->pid())
+ .onFailed(defer(self(), &Self::failed, lambda::_1))
+ .onDiscarded(defer(self(), &Self::discarded));
+
+ // We save and pass the pid of the replica to the 'watch' function
+ // because the field member 'replica' is not available during
+ // recovery. We need the pid to renew the replica's membership.
+ group->watch()
+ .onReady(defer(self(), &Self::watch, replica->pid(), lambda::_1))
+ .onFailed(defer(self(), &Self::failed, lambda::_1))
+ .onDiscarded(defer(self(), &Self::discarded));
}
-private:
- const PID<LogProcess> pid;
- bool reconnect;
-};
+ // Start the recovery.
+ recover();
+}
+
+
+void LogProcess::finalize()
+{
+ if (recovering.isSome()) {
+ // Stop the recovery if it is still pending.
+ recovering.get().discard();
+ }
+ // If there exist operations that are gated by the recovery, we fail
+ // all of them because the log is being deleted.
+ foreach (process::Promise<Shared<Replica> >* promise, promises) {
+ promise->fail("Log is being deleted");
+ delete promise;
+ }
+ promises.clear();
-LogProcess::LogProcess(int _quorum,
- const string& _file,
- const string& _servers,
- const string& _znode,
- int _end,
- const map<int, int>& _truncations)
- : quorum(_quorum),
- file(_file),
- servers(_servers),
- znode(_znode),
- end(_end),
- truncations(_truncations),
- id(0),
- elected(false),
- replica(NULL),
- group(NULL),
- coordinator(NULL) {}
-
-
-LogProcess::~LogProcess()
-{
- delete zk;
- delete watcher;
- delete replica;
delete group;
- delete coordinator;
+
+ // Wait for the shared pointers 'network' and 'replica' to become
+ // unique (i.e., no other reference to them). These calls should not
+ // be blocking for too long because at this moment, all operations
+ // should have been cancelled or are being cancelled. We do this
+ // because we want to make sure that after the log is deleted, all
+ // operations associated with this log are terminated.
+ network.own().await();
+ replica.own().await();
}
-void LogProcess::connected()
+Future<Shared<Replica> > LogProcess::recover()
{
- LOG(INFO) << "Log connected to ZooKeeper";
+ // The future 'recovered' is used to mark the success (or the
+ // failure) of the recovery. We do not use the future 'recovering'
+ // to do that because it can be set in another process and thus has a
+ // race condition which we want to avoid. We deliberately do not
+ // save replica in 'recovered' because it will complicate our
+ // deleting logic (see 'finalize').
+ Future<Nothing> future = recovered.future();
+
+ if (future.isDiscarded()) {
+ return Failure("Not expecting discarded future");
+ } else if (future.isFailed()) {
+ return Failure(future.failure());
+ } else if (future.isReady()) {
+ return replica;
+ }
+
+ // Recovery has not finished yet. Create a promise and queue it such
+ // that it can get notified once the recovery has finished (either
+ // succeeded or failed).
+ process::Promise<Shared<Replica> >* promise =
+ new process::Promise<Shared<Replica> >();
+
+ promises.push_back(promise);
+
+ if (recovering.isNone()) {
+ // TODO(jieyu): At this moment, we haven't shared 'replica' with
+ // others yet. Therefore, the following 'replica.own()' call
+ // should not be blocking. In the future, we may want to support
+ // 'release' in Shared which will provide this CHECK internally.
+ CHECK(replica.unique());
+
+ recovering = log::recover(quorum, replica.own().get(), network)
+ .onAny(defer(self(), &Self::_recover));
+ }
+
+ return promise->future();
+}
+
- int ret;
- string result;
+void LogProcess::_recover()
+{
+ CHECK_SOME(recovering);
- // Assume the znode that was created does not end with a "/".
- CHECK(znode.size() == 0 || znode.at(znode.size() - 1) != '/');
+ Future<Owned<Replica> > future = recovering.get();
- // Create directory path znodes as necessary.
- size_t index = znode.find("/", 0);
+ if (!future.isReady()) {
+ // The 'future' here can only be discarded in 'finalize'.
+ string failure = future.isFailed() ?
+ future.failure() :
+ "The future 'recovering' is unexpectedly discarded";
- while (index < string::npos) {
- // Get out the prefix to create.
- index = znode.find("/", index + 1);
- string prefix = znode.substr(0, index);
+ // Mark the failure of the recovery.
+ recovered.fail(failure);
- LOG(INFO) << "Log trying to create znode '"
- << prefix << "' in ZooKeeper";
+ foreach (process::Promise<Shared<Replica> >* promise, promises) {
+ promise->fail(failure);
+ delete promise;
+ }
+ promises.clear();
+ } else {
+ replica = future.get().share();
- // Create the node (even if it already exists).
- ret = zk->create(
- prefix,
- "",
- ZOO_OPEN_ACL_UNSAFE,
- // ZOO_CREATOR_ALL_ACL, // needs authentication
- 0,
- &result);
+ // Mark the success of the recovery.
+ recovered.set(Nothing());
- if (ret != ZOK && ret != ZNODEEXISTS) {
- LOG(FATAL) << "Failed to create '" << prefix
- << "' in ZooKeeper: " << zk->message(ret);
+ foreach (process::Promise<Shared<Replica> >* promise, promises) {
+ promise->set(replica);
+ delete promise;
}
+ promises.clear();
}
+}
- // Now create the "replicas" znode.
- LOG(INFO) << "Log trying to create znode '" << znode
- << "/replicas" << "' in ZooKeeper";
- // Create the node (even if it already exists).
- ret = zk->create(znode + "/replicas", "", ZOO_OPEN_ACL_UNSAFE,
- // ZOO_CREATOR_ALL_ACL, // needs authentication
- 0, &result);
+void LogProcess::watch(
+ const UPID& pid,
+ const set<zookeeper::Group::Membership>& memberships)
+{
+ if (membership.isReady() && memberships.count(membership.get()) == 0) {
+ // Our replica's membership must have expired, join back up.
+ LOG(INFO) << "Renewing replica group membership";
- if (ret != ZOK && ret != ZNODEEXISTS) {
- LOG(FATAL) << "Failed to create '" << znode << "/replicas"
- << "' in ZooKeeper: " << zk->message(ret);
+ membership = group->join(pid)
+ .onFailed(defer(self(), &Self::failed, lambda::_1))
+ .onDiscarded(defer(self(), &Self::discarded));
}
- // Now create the "coordinators" znode.
- LOG(INFO) << "Log trying to create znode '" << znode
- << "/coordinators" << "' in ZooKeeper";
+ group->watch(memberships)
+ .onReady(defer(self(), &Self::watch, pid, lambda::_1))
+ .onFailed(defer(self(), &Self::failed, lambda::_1))
+ .onDiscarded(defer(self(), &Self::discarded));
+}
- // Create the node (even if it already exists).
- ret = zk->create(znode + "/coordinators", "", ZOO_OPEN_ACL_UNSAFE,
- // ZOO_CREATOR_ALL_ACL, // needs authentication
- 0, &result);
- if (ret != ZOK && ret != ZNODEEXISTS) {
- LOG(FATAL) << "Failed to create '" << znode << "/coordinators"
- << "' in ZooKeeper: " << zk->message(ret);
- }
+void LogProcess::failed(const string& message)
+{
+ LOG(FATAL) << "Failed to participate in ZooKeeper group: " << message;
+}
- // Okay, create our replica, group, and coordinator.
- replica = new ReplicaProcess(file);
- spawn(replica);
- group = new GroupProcess();
- spawn(group);
+void LogProcess::discarded()
+{
+ LOG(FATAL) << "Not expecting future to get discarded!";
+}
- coordinator = new Coordinator(quorum, replica, group);
- // Set a watch on the replicas.
- ret = zk->getChildren(znode + "/replicas", true, NULL);
+/////////////////////////////////////////////////
+// Implementation of LogReaderProcess.
+/////////////////////////////////////////////////
- if (ret != ZOK) {
- LOG(FATAL) << "Failed to set a watch on '" << znode << "/replicas"
- << "' in ZooKeeper: " << zk->message(ret);
- }
- // Set a watch on the coordinators.
- ret = zk->getChildren(znode + "/coordinators", true, NULL);
+LogReaderProcess::LogReaderProcess(Log* log)
+ : ProcessBase(ID::generate("log-reader")),
+ recovering(dispatch(log->process, &LogProcess::recover)) {}
- if (ret != ZOK) {
- LOG(FATAL) << "Failed to set a watch on '" << znode << "/replicas"
- << "' in ZooKeeper: " << zk->message(ret);
- }
- // Add an ephemeral znode for our replica and coordinator.
- ret = zk->create(znode + "/replicas/", replica->self(), ZOO_OPEN_ACL_UNSAFE,
- // ZOO_CREATOR_ALL_ACL, // needs authentication
- ZOO_SEQUENCE | ZOO_EPHEMERAL, &result);
+void LogReaderProcess::initialize()
+{
+ recovering.onAny(defer(self(), &Self::_recover));
+}
- if (ret != ZOK) {
- LOG(FATAL) << "Failed to create an ephmeral node at '" << znode
- << "/replica/" << "' in ZooKeeper: " << zk->message(ret);
+
+void LogReaderProcess::finalize()
+{
+ foreach (process::Promise<Nothing>* promise, promises) {
+ promise->fail("Log reader is being deleted");
+ delete promise;
}
+ promises.clear();
+}
- ret = zk->create(znode + "/coordinators/", "", ZOO_OPEN_ACL_UNSAFE,
- // ZOO_CREATOR_ALL_ACL, // needs authentication
- ZOO_SEQUENCE | ZOO_EPHEMERAL, &result);
- if (ret != ZOK) {
- LOG(FATAL) << "Failed to create an ephmeral node at '" << znode
- << "/replica/" << "' in ZooKeeper: " << zk->message(ret);
+Future<Nothing> LogReaderProcess::recover()
+{
+ if (recovering.isReady()) {
+ return Nothing();
+ } else if (recovering.isFailed()) {
+ return Failure(recovering.failure());
+ } else if (recovering.isDiscarded()) {
+ return Failure("The future 'recovering' is unexpectedly discarded");
}
- // Save the sequence id but only grab the basename, e.g.,
- // "/path/to/znode/000000131" => "000000131".
- result = utils::os::basename(result);
+ // At this moment, the future 'recovering' should most likely be
+ // pending. But it is also possible that it gets set after the above
+ // checks. Either way, we know that the continuation '_recover' has
+ // not been called yet (otherwise, we should not be able to reach
+ // here). The promise we are creating below will be properly
+ // set/failed when '_recover' is called.
+ process::Promise<Nothing>* promise = new process::Promise<Nothing>();
+ promises.push_back(promise);
+ return promise->future();
+}
+
- try {
- id = boost::lexical_cast<uint64_t>(result);
- } catch (boost::bad_lexical_cast&) {
- LOG(FATAL) << "Failed to convert '" << result << "' into an integer";
+void LogReaderProcess::_recover()
+{
+ if (!recovering.isReady()) {
+ foreach (process::Promise<Nothing>* promise, promises) {
+ promise->fail(
+ recovering.isFailed() ?
+ recovering.failure() :
+ "The future 'recovering' is unexpectedly discarded");
+ delete promise;
+ }
+ promises.clear();
+ } else {
+ foreach (process::Promise<Nothing>* promise, promises) {
+ promise->set(Nothing());
+ delete promise;
+ }
+ promises.clear();
}
+}
+
- // Run an election!
- elect();
+Future<Log::Position> LogReaderProcess::beginning()
+{
+ return recover().then(defer(self(), &Self::_beginning));
}
-void LogProcess::reconnecting()
+Future<Log::Position> LogReaderProcess::_beginning()
{
- LOG(INFO) << "Reconnecting to ZooKeeper";
+ CHECK(recovering.isReady());
+
+ return recovering.get()->beginning()
+ .then(lambda::bind(&Self::position, lambda::_1));
}
-void LogProcess::reconnected()
+Future<Log::Position> LogReaderProcess::ending()
{
- LOG(INFO) << "Reconnected to ZooKeeper";
+ return recover().then(defer(self(), &Self::_ending));
}
-void LogProcess::expired()
+Future<Log::Position> LogReaderProcess::_ending()
{
- restart();
+ CHECK(recovering.isReady());
+
+ return recovering.get()->ending()
+ .then(lambda::bind(&Self::position, lambda::_1));
}
-void LogProcess::updated(const string& path)
+Future<list<Log::Entry> > LogReaderProcess::read(
+ const Log::Position& from,
+ const Log::Position& to)
{
- if (znode + "/replicas" == path) {
+ return recover().then(defer(self(), &Self::_read, from, to));
+}
- regroup();
- // Reset a watch on the replicas.
- int ret = zk->getChildren(znode + "/replicas", true, NULL);
+Future<list<Log::Entry> > LogReaderProcess::_read(
+ const Log::Position& from,
+ const Log::Position& to)
+{
+ CHECK(recovering.isReady());
- if (ret != ZOK) {
- LOG(FATAL) << "Failed to set a watch on '" << znode << "/replicas"
- << "' in ZooKeeper: " << zk->message(ret);
- }
- } else {
- CHECK(znode + "/coordinators" == path);
+ return recovering.get()->read(from.value, to.value)
+ .then(defer(self(), &Self::__read, from, to, lambda::_1));
+}
- elect();
- // Reset a watch on the coordinators.
- int ret = zk->getChildren(znode + "/coordinators", true, NULL);
+Future<list<Log::Entry> > LogReaderProcess::__read(
+ const Log::Position& from,
+ const Log::Position& to,
+ const list<Action>& actions)
+{
+ list<Log::Entry> entries;
+
+ uint64_t position = from.value;
+
+ foreach (const Action& action, actions) {
+ // Ensure read range is valid.
+ if (!action.has_performed() ||
+ !action.has_learned() ||
+ !action.learned()) {
+ return Failure("Bad read range (includes pending entries)");
+ } else if (position++ != action.position()) {
+ return Failure("Bad read range (includes missing entries)");
+ }
- if (ret != ZOK) {
- LOG(FATAL) << "Failed to set a watch on '" << znode << "/replicas"
- << "' in ZooKeeper: " << zk->message(ret);
+ // And only return appends.
+ CHECK(action.has_type());
+ if (action.type() == Action::APPEND) {
+ entries.push_back(Log::Entry(action.position(), action.append().bytes()));
}
}
+
+ return entries;
}
-void LogProcess::initalize()
+Log::Position LogReaderProcess::position(uint64_t value)
{
- // TODO(benh): Real testing requires injecting a ZooKeeper instance.
- watcher = new LogProcessWatcher(self());
- zk = new ZooKeeper(servers, 10000, watcher);
+ return Log::Position(value);
}
-void LogProcess::regroup()
+/////////////////////////////////////////////////
+// Implementation of LogWriterProcess.
+/////////////////////////////////////////////////
+
+
+LogWriterProcess::LogWriterProcess(Log* log)
+ : ProcessBase(ID::generate("log-writer")),
+ quorum(log->process->quorum),
+ network(log->process->network),
+ recovering(dispatch(log->process, &LogProcess::recover)),
+ coordinator(NULL),
+ error(None()) {}
+
+
+void LogWriterProcess::initialize()
{
- vector<string> results;
+ recovering.onAny(defer(self(), &Self::_recover));
+}
- int ret = zk->getChildren(znode + "/replicas", false, &results);
- if (ret != ZOK) {
- LOG(FATAL) << "Failed to get children of '" << znode << "/replicas"
- << "' in ZooKeeper: " << zk->message(ret);
+void LogWriterProcess::finalize()
+{
+ foreach (process::Promise<Nothing>* promise, promises) {
+ promise->fail("Log writer is being deleted");
+ delete promise;
}
+ promises.clear();
+
+ delete coordinator;
+}
- set<UPID> current;
- set<UPID> added;
- set<UPID> removed;
- foreach (const string& result, results) {
- string s;
- int ret = zk->get(znode + "/replicas/" + result, false, &s, NULL);
- UPID pid = s;
- current.insert(pid);
+Future<Nothing> LogWriterProcess::recover()
+{
+ if (recovering.isReady()) {
+ return Nothing();
+ } else if (recovering.isFailed()) {
+ return Failure(recovering.failure());
+ } else if (recovering.isDiscarded()) {
+ return Failure("The future 'recovering' is unexpectedly discarded");
}
- foreach (const UPID& pid, current) {
- if (members.count(pid) == 0) {
- added.insert(pid);
+ // At this moment, the future 'recovering' should most likely be
+ // pending. But it is also possible that it gets set after the above
+ // checks. Either way, we know that the continuation '_recover' has
+ // not been called yet (otherwise, we should not be able to reach
+ // here). The promise we are creating below will be properly
+ // set/failed when '_recover' is called.
+ process::Promise<Nothing>* promise = new process::Promise<Nothing>();
+ promises.push_back(promise);
+ return promise->future();
+}
+
+
+void LogWriterProcess::_recover()
+{
+ if (!recovering.isReady()) {
+ foreach (process::Promise<Nothing>* promise, promises) {
+ promise->fail(
+ recovering.isFailed() ?
+ recovering.failure() :
+ "The future 'recovering' is unexpectedly discarded");
+ delete promise;
+ }
+ promises.clear();
+ } else {
+ foreach (process::Promise<Nothing>* promise, promises) {
+ promise->set(Nothing());
+ delete promise;
}
+ promises.clear();
}
+}
- foreach (const UPID& pid, members) {
- if (current.count(pid) == 0) {
- removed.insert(pid);
- }
+
+Future<Option<Log::Position> > LogWriterProcess::elect()
+{
+ return recover().then(defer(self(), &Self::_elect));
+}
+
+
+Future<Option<Log::Position> > LogWriterProcess::_elect()
+{
+ // We delete the existing coordinator (if one exists) and create a new
+ // coordinator each time 'elect' is called.
+ delete coordinator;
+ error = None();
+
+ CHECK(recovering.isReady());
+
+ coordinator = new Coordinator(quorum, recovering.get(), network);
+
+ return coordinator->elect()
+ .then(defer(self(), &Self::__elect, lambda::_1))
+ .onFailed(defer(self(), &Self::failed, lambda::_1));
+}
+
+
+Option<Log::Position> LogWriterProcess::__elect(const Option<uint64_t>& result)
+{
+ if (result.isNone()) {
+ return None();
+ } else {
+ return position(result.get());
+ }
+}
+
+
+Future<Log::Position> LogWriterProcess::append(const string& bytes)
+{
+ if (coordinator == NULL) {
+ return Failure("No election has been performed");
}
- foreach (const UPID& pid, added) {
- dispatch(group, &GroupProcess::add, pid);
- members.insert(pid);
+ if (error.isSome()) {
+ return Failure(error.get());
}
- foreach (const UPID& pid, removed) {
- dispatch(group, &GroupProcess::remove, pid);
- members.erase(pid);
+ return coordinator->append(bytes)
+ .then(lambda::bind(&Self::position, lambda::_1))
+ .onFailed(defer(self(), &Self::failed, lambda::_1));
+}
+
+
+Future<Log::Position> LogWriterProcess::truncate(const Log::Position& to)
+{
+ if (coordinator == NULL) {
+ return Failure("No election has been performed");
}
+
+ if (error.isSome()) {
+ return Failure(error.get());
+ }
+
+ return coordinator->truncate(to.value)
+ .then(lambda::bind(&Self::position, lambda::_1))
+ .onFailed(defer(self(), &Self::failed, lambda::_1));
}
-void LogProcess::elect()
+void LogWriterProcess::failed(const string& message)
{
- vector<string> results;
+ error = message;
+}
- int ret = zk->getChildren(znode + "/coordinators", false, &results);
- if (ret != ZOK) {
- LOG(FATAL) << "Failed to get children of '" << znode << "/coordinators"
- << "' in ZooKeeper: " << zk->message(ret);
- }
+Log::Position LogWriterProcess::position(uint64_t value)
+{
+ return Log::Position(value);
+}
+
+
+/////////////////////////////////////////////////
+// Public interfaces for Log.
+/////////////////////////////////////////////////
+
+
+Log::Log(
+ int quorum,
+ const string& path,
+ const set<UPID>& pids)
+{
+ GOOGLE_PROTOBUF_VERIFY_VERSION;
+
+ process = new LogProcess(quorum, path, pids);
+ spawn(process);
+}
+
+Log::Log(
+ int quorum,
+ const string& path,
+ const string& servers,
+ const Duration& timeout,
+ const string& znode,
+ const Option<zookeeper::Authentication>& auth)
+{
+ GOOGLE_PROTOBUF_VERIFY_VERSION;
+
+ process = new LogProcess(quorum, path, servers, timeout, znode, auth);
+ spawn(process);
+}
+
+
+Log::~Log()
+{
+ terminate(process);
+ process::wait(process);
+ delete process;
+}
+
- // "Elect" the minimum ephemeral znode.
- uint64_t min = LONG_MAX;
- foreach (const string& result, results) {
- try {
- min = std::min(min, boost::lexical_cast<uint64_t>(result));
- } catch (boost::bad_lexical_cast&) {
- LOG(FATAL) << "Failed to convert '" << result << "' into an integer";
+/////////////////////////////////////////////////
+// Public interfaces for Log::Reader.
+/////////////////////////////////////////////////
+
+
+Log::Reader::Reader(Log* log)
+{
+ process = new LogReaderProcess(log);
+ spawn(process);
+}
+
+
+Log::Reader::~Reader()
+{
+ terminate(process);
+ process::wait(process);
+ delete process;
+}
+
+
+Result<list<Log::Entry> > Log::Reader::read(
+ const Log::Position& from,
+ const Log::Position& to,
+ const Timeout& timeout)
+{
+ Future<list<Log::Entry> > future =
+ dispatch(process, &LogReaderProcess::read, from, to);
+
+ if (!future.await(timeout.remaining())) {
+ LOG(INFO) << "Timed out while trying to read the log";
+
+ future.discard();
+ return None();
+ } else {
+ if (!future.isReady()) {
+ string failure =
+ future.isFailed() ?
+ future.failure() :
+ "Not expecting discarded future";
+
+ LOG(ERROR) << "Failed to read the log: " << failure;
+
+ return Error(failure);
+ } else {
+ return future.get();
}
}
+}
- if (id == min && !elected) {
- elected = true;
- process::run(&coordinate, coordinator, id, end, truncations);
- } else if (elected) {
- LOG(INFO) << "Restarting due to demoted";
- restart();
- }
+
+Log::Position Log::Reader::beginning()
+{
+ // TODO(benh): Take a timeout and return an Option.
+ return dispatch(process, &LogReaderProcess::beginning).get();
}
-int main(int argc, char** argv)
+Log::Position Log::Reader::ending()
{
- if (argc < 6) {
- fatal("Usage: %s <quorum> <file> <servers> <znode> <end> <at> <to> ...",
- argv[0]);
- }
+ // TODO(benh): Take a timeout and return an Option.
+ return dispatch(process, &LogReaderProcess::ending).get();
+}
- args = argv;
- int quorum = atoi(argv[1]);
- string file = argv[2];
- string servers = argv[3];
- string znode = argv[4];
- int end = atoi(argv[5]);
+/////////////////////////////////////////////////
+// Public interfaces for Log::Writer.
+/////////////////////////////////////////////////
- map<int, int> truncations;
- for (int i = 6; argv[i] != NULL; i += 2) {
- if (argv[i + 1] == NULL) {
- fatal("Expecting 'to' argument for truncation");
+Log::Writer::Writer(Log* log, const Duration& timeout, int retries)
+{
+ process = new LogWriterProcess(log);
+ spawn(process);
+
+ // Trying to get elected.
+ for (;;) {
+ LOG(INFO) << "Attempting to get elected within " << timeout;
+
+ Future<Option<Log::Position> > future =
+ dispatch(process, &LogWriterProcess::elect);
+
+ if (!future.await(timeout)) {
+ LOG(INFO) << "Timed out while trying to get elected";
+
+ // Cancel the election. It is possible that the election is done
+ // right after the timeout has been reached. In that case, we
+ // may unnecessarily rerun the election, but it is safe.
+ future.discard();
+ } else {
+ if (!future.isReady()) {
+ string failure =
+ future.isFailed() ?
+ future.failure() :
+ "Not expecting discarded future";
+
+ LOG(ERROR) << "Failed to get elected: " << failure;
+ break;
+ } else if (future.get().isNone()) {
+ LOG(INFO) << "Lost an election, but can be retried";
+ } else {
+ LOG(INFO) << "Elected with current position "
+ << future.get().get().value;
+ return;
+ }
+ }
+
+ if (--retries < 0) {
+ LOG(ERROR) << "Retry limit has been reached during election";
+ break;
}
+ }
+}
+
+
+Log::Writer::~Writer()
+{
+ terminate(process);
+ process::wait(process);
+ delete process;
+}
+
+
+Result<Log::Position> Log::Writer::append(
+ const string& data,
+ const Timeout& timeout)
+{
+ LOG(INFO) << "Attempting to append " << data.size() << " bytes to the log";
+
+ Future<Log::Position> future =
+ dispatch(process, &LogWriterProcess::append, data);
+
+ if (!future.await(timeout.remaining())) {
+ LOG(INFO) << "Timed out while trying to append the log";
- int at = atoi(argv[i]);
- int to = atoi(argv[i + 1]);
+ future.discard();
+ return None();
+ } else {
+ if (!future.isReady()) {
+ string failure =
+ future.isFailed() ?
+ future.failure() :
+ "Not expecting discarded future";
+
+ LOG(ERROR) << "Failed to append the log: " << failure;
- truncations[at] = to;
+ return Error(failure);
+ } else {
+ return future.get();
+ }
}
+}
+
+
+Result<Log::Position> Log::Writer::truncate(
+ const Log::Position& to,
+ const Timeout& timeout)
+{
+ LOG(INFO) << "Attempting to truncate the log to " << to.value;
- process::initialize();
+ Future<Log::Position> future =
+ dispatch(process, &LogWriterProcess::truncate, to);
- LogProcess log(quorum, file, servers, znode, end, truncations);
- spawn(log);
- wait(log);
+ if (!future.await(timeout.remaining())) {
+ LOG(INFO) << "Timed out while trying to truncate the log";
- return 0;
+ future.discard();
+ return None();
+ } else {
+ if (!future.isReady()) {
+ string failure =
+ future.isFailed() ?
+ future.failure() :
+ "Not expecting discarded future";
+
+ LOG(ERROR) << "Failed to truncate the log: " << failure;
+
+ return Error(failure);
+ } else {
+ return future.get();
+ }
+ }
}
+
+} // namespace log {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/f9b60c4c/src/log/log.hpp
----------------------------------------------------------------------
diff --git a/src/log/log.hpp b/src/log/log.hpp
index 042f13b..1f0b30d 100644
--- a/src/log/log.hpp
+++ b/src/log/log.hpp
@@ -28,15 +28,10 @@
#include <process/shared.hpp>
#include <process/timeout.hpp>
-#include <stout/check.hpp>
-#include <stout/error.hpp>
-#include <stout/foreach.hpp>
+#include <stout/duration.hpp>
#include <stout/none.hpp>
+#include <stout/option.hpp>
#include <stout/result.hpp>
-#include <stout/try.hpp>
-
-#include "log/coordinator.hpp"
-#include "log/replica.hpp"
#include "zookeeper/group.hpp"
@@ -44,6 +39,12 @@ namespace mesos {
namespace internal {
namespace log {
+// Forward declarations.
+class LogProcess;
+class LogReaderProcess;
+class LogWriterProcess;
+
+
class Log
{
public:
@@ -98,9 +99,12 @@ public:
private:
friend class Log;
- friend class Reader;
friend class Writer;
+ friend class LogReaderProcess;
+ friend class LogWriterProcess;
+
Position(uint64_t _value) : value(_value) {}
+
uint64_t value;
};
@@ -111,8 +115,8 @@ public:
std::string data;
private:
- friend class Reader;
- friend class Writer;
+ friend class LogReaderProcess;
+
Entry(const Position& _position, const std::string& _data)
: position(_position), data(_data) {}
};
@@ -125,9 +129,10 @@ public:
// Returns all entries between the specified positions, unless
// those positions are invalid, in which case returns an error.
- Result<std::list<Entry> > read(const Position& from,
- const Position& to,
- const process::Timeout& timeout);
+ Result<std::list<Entry> > read(
+ const Position& from,
+ const Position& to,
+ const process::Timeout& timeout);
// Returns the beginning position of the log from the perspective
// of the local replica (which may be out of date if the log has
@@ -141,7 +146,7 @@ public:
Position ending();
private:
- process::Shared<Replica> replica;
+ LogReaderProcess* process;
};
class Writer
@@ -171,69 +176,28 @@ public:
const process::Timeout& timeout);
private:
- Option<std::string> error;
- Coordinator coordinator;
+ LogWriterProcess* process;
};
// Creates a new replicated log that assumes the specified quorum
- // size, is backed by a file at the specified path, and coordiantes
+ // size, is backed by a file at the specified path, and coordinates
// with other replicas via the set of process PIDs.
- Log(int _quorum,
+ Log(int quorum,
const std::string& path,
- const std::set<process::UPID>& pids)
- : group(NULL),
- executor(NULL),
- quorum(_quorum),
- replica(new Replica(path))
- {
- GOOGLE_PROTOBUF_VERIFY_VERSION;
-
- // Add our own replica to the network.
- Network* _network = new Network(pids);
- _network->add(replica->pid());
-
- network.reset(_network);
- }
+ const std::set<process::UPID>& pids);
// Creates a new replicated log that assumes the specified quorum
- // size, is backed by a file at the specified path, and coordiantes
+ // size, is backed by a file at the specified path, and coordinates
// with other replicas associated with the specified ZooKeeper
// servers, timeout, and znode.
- Log(int _quorum,
+ Log(int quorum,
const std::string& path,
const std::string& servers,
const Duration& timeout,
const std::string& znode,
- const Option<zookeeper::Authentication>& auth = None())
- : group(new zookeeper::Group(servers, timeout, znode, auth)),
- executor(new process::Executor()),
- quorum(_quorum),
- replica(new Replica(path)),
- network(new ZooKeeperNetwork(servers, timeout, znode, auth))
- {
- GOOGLE_PROTOBUF_VERIFY_VERSION;
+ const Option<zookeeper::Authentication>& auth = None());
- // Need to add our replica to the ZooKeeper group!
- LOG(INFO) << "Attempting to join replica to ZooKeeper group";
-
- membership = group->join(replica->pid())
- .onFailed(executor->defer(lambda::bind(&Log::failed, this, lambda::_1)))
- .onDiscarded(executor->defer(lambda::bind(&Log::discarded, this)));
-
- group->watch()
- .onReady(executor->defer(lambda::bind(&Log::watch, this, lambda::_1)))
- .onFailed(executor->defer(lambda::bind(&Log::failed, this, lambda::_1)))
- .onDiscarded(executor->defer(lambda::bind(&Log::discarded, this)));
- }
-
- ~Log()
- {
- network.own().await();
- replica.own().await();
-
- delete executor;
- delete group;
- }
+ ~Log();
// Returns a position based off of the bytes recovered from
// Position.identity().
@@ -252,196 +216,13 @@ public:
((uint64_t) (bytes[7] & 0xff));
return Position(value);
}
-private:
- friend class Reader;
- friend class Writer;
-
- // TODO(benh): Factor this out into some sort of "membership renewer".
- void watch(const std::set<zookeeper::Group::Membership>& memberships);
- void failed(const std::string& message) const;
- void discarded() const;
-
- // We store a Group instance in order to continually renew the
- // replicas membership (when using ZooKeeper).
- zookeeper::Group* group;
- process::Future<zookeeper::Group::Membership> membership;
- process::Executor* executor;
-
- int quorum;
- process::Shared<Replica> replica;
- process::Shared<Network> network;
-};
-
-
-Log::Reader::Reader(Log* log)
- : replica(log->replica) {}
-
-
-Log::Reader::~Reader() {}
-
-
-Result<std::list<Log::Entry> > Log::Reader::read(
- const Log::Position& from,
- const Log::Position& to,
- const process::Timeout& timeout)
-{
- process::Future<std::list<Action> > actions =
- replica->read(from.value, to.value);
-
- if (!actions.await(timeout.remaining())) {
- return None();
- } else if (actions.isFailed()) {
- return Error(actions.failure());
- }
-
- CHECK(actions.isReady()) << "Not expecting discarded future!";
-
- std::list<Log::Entry> entries;
-
- uint64_t position = from.value;
-
- foreach (const Action& action, actions.get()) {
- // Ensure read range is valid.
- if (!action.has_performed() ||
- !action.has_learned() ||
- !action.learned()) {
- return Error("Bad read range (includes pending entries)");
- } else if (position++ != action.position()) {
- return Error("Bad read range (includes missing entries)");
- }
-
- // And only return appends.
- CHECK(action.has_type());
- if (action.type() == Action::APPEND) {
- entries.push_back(Entry(action.position(), action.append().bytes()));
- }
- }
-
- return entries;
-}
-
-
-Log::Position Log::Reader::beginning()
-{
- // TODO(benh): Take a timeout and return an Option.
- process::Future<uint64_t> value = replica->beginning();
- value.await();
- CHECK(value.isReady()) << "Not expecting a failed or discarded future!";
- return Log::Position(value.get());
-}
-
-
-Log::Position Log::Reader::ending()
-{
- // TODO(benh): Take a timeout and return an Option.
- process::Future<uint64_t> value = replica->ending();
- value.await();
- CHECK(value.isReady()) << "Not expecting a failed or discarded future!";
- return Log::Position(value.get());
-}
-
-
-Log::Writer::Writer(Log* log, const Duration& timeout, int retries)
- : error(None()),
- coordinator(log->quorum, log->replica, log->network)
-{
- do {
- Result<uint64_t> result = coordinator.elect(process::Timeout::in(timeout));
- if (result.isNone()) {
- retries--;
- } else if (result.isSome()) {
- break;
- } else {
- error = result.error();
- break;
- }
- } while (retries > 0);
-}
-
-
-Log::Writer::~Writer()
-{
- coordinator.demote();
-}
-
-
-Result<Log::Position> Log::Writer::append(
- const std::string& data,
- const process::Timeout& timeout)
-{
- if (error.isSome()) {
- return Error(error.get());
- }
-
- LOG(INFO) << "Attempting to append " << data.size() << " bytes to the log";
-
- Result<uint64_t> result = coordinator.append(data, timeout);
-
- if (result.isError()) {
- error = result.error();
- return Error(error.get());
- } else if (result.isNone()) {
- return None();
- }
-
- CHECK_SOME(result);
-
- return Log::Position(result.get());
-}
-
-
-Result<Log::Position> Log::Writer::truncate(
- const Log::Position& to,
- const process::Timeout& timeout)
-{
- if (error.isSome()) {
- return Error(error.get());
- }
-
- LOG(INFO) << "Attempting to truncate the log to " << to.value;
-
- Result<uint64_t> result = coordinator.truncate(to.value, timeout);
-
- if (result.isError()) {
- error = result.error();
- return Error(error.get());
- } else if (result.isNone()) {
- return None();
- }
-
- CHECK_SOME(result);
-
- return Log::Position(result.get());
-}
-
-
-void Log::watch(const std::set<zookeeper::Group::Membership>& memberships)
-{
- if (membership.isReady() && memberships.count(membership.get()) == 0) {
- // Our replica's membership must have expired, join back up.
- LOG(INFO) << "Renewing replica group membership";
- membership = group->join(replica->pid())
- .onFailed(executor->defer(lambda::bind(&Log::failed, this, lambda::_1)))
- .onDiscarded(executor->defer(lambda::bind(&Log::discarded, this)));
- }
-
- group->watch(memberships)
- .onReady(executor->defer(lambda::bind(&Log::watch, this, lambda::_1)))
- .onFailed(executor->defer(lambda::bind(&Log::failed, this, lambda::_1)))
- .onDiscarded(executor->defer(lambda::bind(&Log::discarded, this)));
-}
-
-
-void Log::failed(const std::string& message) const
-{
- LOG(FATAL) << "Failed to participate in ZooKeeper group: " << message;
-}
+private:
+ friend class LogReaderProcess;
+ friend class LogWriterProcess;
-void Log::discarded() const
-{
- LOG(FATAL) << "Not expecting future to get discarded!";
-}
+ LogProcess* process;
+};
} // namespace log {
} // namespace internal {
http://git-wip-us.apache.org/repos/asf/mesos/blob/f9b60c4c/src/log/recover.cpp
----------------------------------------------------------------------
diff --git a/src/log/recover.cpp b/src/log/recover.cpp
new file mode 100644
index 0000000..0ab8e95
--- /dev/null
+++ b/src/log/recover.cpp
@@ -0,0 +1,403 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdlib.h>
+
+#include <set>
+
+#include <process/defer.hpp>
+#include <process/delay.hpp>
+#include <process/id.hpp>
+#include <process/process.hpp>
+
+#include <stout/foreach.hpp>
+#include <stout/hashmap.hpp>
+#include <stout/lambda.hpp>
+#include <stout/none.hpp>
+#include <stout/option.hpp>
+
+#include "common/type_utils.hpp"
+
+#include "log/catchup.hpp"
+#include "log/recover.hpp"
+
+#include "messages/log.hpp"
+
+using namespace process;
+
+using std::set;
+
+namespace mesos {
+namespace internal {
+namespace log {
+
+// This process is used to recover a replica. The flow of the recover
+// process is described as follows:
+// A) Check the status of the local replica.
+// A1) If it is VOTING, exit.
+// A2) If it is not VOTING, goto (B).
+// B) Broadcast a RecoverRequest to all replicas in the network.
+// B1) <<< Catch-up >>> If a quorum of replicas are found in VOTING
+// status (no matter what the status of the local replica is),
+// set the status of the local replica to RECOVERING, and start
+// doing catch-up. If the local replica has been caught-up, set
+// the status of the local replica to VOTING and exit.
+// B2) If a quorum is not found, goto (B).
+//
+// In the following, we list a few scenarios and show how the recover
+// process will respond in those scenarios. All the examples assume a
+// quorum size of 2. Remember that a new replica is always put in
+// EMPTY status initially.
+//
+// 1) Replica A, B and C are all in VOTING status. The operator adds
+// replica D. In that case, D will go into RECOVERING status and
+// then go into VOTING status. Therefore, we should avoid adding a
+// new replica unless we know that one replica has been removed.
+//
+// 2) Replica A and B are in VOTING status. The operator adds replica
+// C. In that case, C will go into RECOVERING status and then go
+// into VOTING status, which is expected.
+//
+// 3) Replica A is in VOTING status. The operator adds replica B. In
+// that case, B will stay in EMPTY status forever. This is expected
+// because we cannot make progress if there are not enough VOTING
+// replicas (i.e., fewer than a quorum).
+//
+// 4) Replica A is in VOTING status and B is in EMPTY status. The
+// operator adds replica C. In that case, C will stay in EMPTY
+// status forever similar to case 3).
+class RecoverProcess : public Process<RecoverProcess>
+{
+public:
+ RecoverProcess(
+ size_t _quorum,
+ const Owned<Replica>& _replica,
+ const Shared<Network>& _network)
+ : ProcessBase(ID::generate("log-recover")),
+ quorum(_quorum),
+ replica(_replica),
+ network(_network) {}
+
+ Future<Owned<Replica> > future() { return promise.future(); }
+
+protected:
+ virtual void initialize()
+ {
+ LOG(INFO) << "Start recovering a replica";
+
+ // Stop when no one cares.
+ promise.future().onDiscarded(lambda::bind(
+ static_cast<void(*)(const UPID&, bool)>(terminate), self(), true));
+
+ // Check the current status of the local replica and decide if
+ // recovery is needed. Recovery is needed if the local replica is
+ // not in VOTING status.
+ replica->status().onAny(defer(self(), &Self::checked, lambda::_1));
+ }
+
+ virtual void finalize()
+ {
+ LOG(INFO) << "Recover process terminated";
+
+ // Cancel all operations if they are still pending.
+ discard(responses);
+ catching.discard();
+ }
+
+private:
+ void checked(const Future<Metadata::Status>& future)
+ {
+ if (!future.isReady()) {
+ promise.fail(
+ future.isFailed() ?
+ "Failed to get replica status: " + future.failure() :
+ "Not expecting discarded future");
+
+ terminate(self());
+ return;
+ }
+
+ status = future.get();
+
+ LOG(INFO) << "Replica is in " << status << " status";
+
+ if (status == Metadata::VOTING) {
+ promise.set(replica);
+ terminate(self());
+ } else {
+ recover();
+ }
+ }
+
+ void recover()
+ {
+ CHECK_NE(status, Metadata::VOTING);
+
+ // Broadcast recover request to all replicas.
+ network->broadcast(protocol::recover, RecoverRequest())
+ .onAny(defer(self(), &Self::broadcasted, lambda::_1));
+ }
+
+ void broadcasted(const Future<set<Future<RecoverResponse> > >& future)
+ {
+ if (!future.isReady()) {
+ promise.fail(
+ future.isFailed() ?
+ "Failed to broadcast the recover request: " + future.failure() :
+ "Not expecting discarded future");
+
+ terminate(self());
+ return;
+ }
+
+ responses = future.get();
+
+ if (responses.empty()) {
+ // Retry if no replica is currently in the network.
+ retry();
+ } else {
+ // Instead of using a for loop here, we use select to process
+ // responses one after another so that we can ignore the rest if
+ // we have collected enough responses.
+ select(responses)
+ .onReady(defer(self(), &Self::received, lambda::_1));
+
+ // Reset the counters.
+ responsesReceived.clear();
+ lowestBeginPosition = None();
+ highestEndPosition = None();
+ }
+ }
+
+ void received(const Future<RecoverResponse>& future)
+ {
+ // Enforced by the select semantics.
+ CHECK(future.isReady());
+
+ // Remove this future from 'responses' so that we do not listen on
+ // it the next time we invoke select.
+ responses.erase(future);
+
+ const RecoverResponse& response = future.get();
+
+ LOG(INFO) << "Received a recover response from a replica in "
+ << response.status() << " status";
+
+ responsesReceived[response.status()]++;
+
+ // We need to remember the lowest begin position and highest end
+ // position seen from VOTING replicas.
+ if (response.status() == Metadata::VOTING) {
+ CHECK(response.has_begin() && response.has_end());
+
+ lowestBeginPosition = min(lowestBeginPosition, response.begin());
+ highestEndPosition = max(highestEndPosition, response.end());
+ }
+
+ // If we got responses from a quorum of VOTING replicas, the local
+ // replica will be put in RECOVERING status and start catching up.
+ // It is likely that the local replica is in RECOVERING status
+ // already. This is the case where the replica crashes during
+ // catch-up. When it restarts, we need to recalculate the lowest
+ // begin position and the highest end position since we haven't
+ // persisted this information on disk.
+ if (responsesReceived[Metadata::VOTING] >= quorum) {
+ discard(responses);
+ update(Metadata::RECOVERING);
+ return;
+ }
+
+ if (responses.empty()) {
+ // All responses have been received but neither have we received
+ // enough responses from VOTING replicas to do catch-up, nor are
+ // we in start-up case. This is either because we don't have
+ // enough replicas in the network (e.g. ZooKeeper blip), or we
+ // don't have enough VOTING replicas to proceed. We will retry
+ // the recovery in both cases.
+ retry();
+ } else {
+ // Wait for the next response.
+ select(responses)
+ .onReady(defer(self(), &Self::received, lambda::_1));
+ }
+ }
+
+ void update(const Metadata::Status& _status)
+ {
+ LOG(INFO) << "Updating replica status from "
+ << status << " to " << _status;
+
+ replica->update(_status)
+ .onAny(defer(self(), &Self::updated, _status, lambda::_1));
+ }
+
+ void updated(const Metadata::Status& _status, const Future<bool>& future)
+ {
+ if (!future.isReady()) {
+ promise.fail(
+ future.isFailed() ?
+ "Failed to update replica status: " + future.failure() :
+ "Not expecting discarded future");
+
+ terminate(self());
+ return;
+ } else if (!future.get()) {
+ promise.fail("Failed to update replica status");
+ terminate(self());
+ return;
+ }
+
+ // The replica status has been updated successfully. Depending on
+ // the new status, we decide what the next action should be.
+ status = _status;
+
+ if (status == Metadata::VOTING) {
+ LOG(INFO) << "Successfully joined the Paxos group";
+
+ promise.set(replica);
+ terminate(self());
+ } else if (status == Metadata::RECOVERING) {
+ catchup();
+ } else {
+ // The replica should not be in any other status.
+ LOG(FATAL) << "Unexpected replica status";
+ }
+ }
+
+ void catchup()
+ {
+ // We reach here either because the log is empty (uninitialized),
+ // or the log is not empty but a previous unfinished catch-up
+ // attempt has been detected (the process crashed or was killed while
+ // catching up). In either case, the local replica may have lost
+ // some data and Paxos states, and should not be allowed to vote.
+ // Otherwise, we may introduce inconsistency in the log as the
+ // local replica could have accepted a write which it would not
+ // have accepted if the data and the Paxos states were not lost.
+ // Now, the question is how many positions the local replica
+ // should catch up before it can be allowed to vote. We find that
+ // it is sufficient to catch-up positions from _begin_ to _end_
+ // where _begin_ is the smallest position seen in a quorum of
+ // VOTING replicas and _end_ is the largest position seen in a
+ // quorum of VOTING replicas. Here is the correctness argument.
+ // For a position _e_ larger than _end_, obviously no value has
+ // been agreed on for that position. Otherwise, we should find at
+ // least one VOTING replica in a quorum of replicas such that its
+ // end position is larger than _end_. For the same reason, a
+ // coordinator should not have collected enough promises for
+ // position _e_. Therefore, it's safe for the local replica to
+ // vote for that position. For a position _b_ smaller than
+ // _begin_, it should have already been truncated and the
+ // truncation should have already been agreed. Therefore, allowing
+ // the local replica to vote for that position is safe.
+ CHECK(lowestBeginPosition.isSome());
+ CHECK(highestEndPosition.isSome());
+ CHECK_LE(lowestBeginPosition.get(), highestEndPosition.get());
+
+ uint64_t begin = lowestBeginPosition.get();
+ uint64_t end = highestEndPosition.get();
+
+ set<uint64_t> positions;
+ for (uint64_t p = begin; p <= end; ++p) {
+ positions.insert(p);
+ }
+
+ // Share the ownership of the replica. From this point until the
+ // point where the ownership of the replica is regained, we should
+ // not access the 'replica' field.
+ Shared<Replica> shared = replica.share();
+
+ // Since we do not know what proposal number to use (the log is
+ // empty), we use proposal number 0 and leave log::catchup to
+ // automatically bump the proposal number.
+ catching = log::catchup(quorum, shared, network, 0, positions);
+ catching.onAny(defer(self(), &Self::caughtup, shared, lambda::_1));
+ }
+
+ void caughtup(Shared<Replica> shared, const Future<Nothing>& future)
+ {
+ if (!future.isReady()) {
+ promise.fail(
+ future.isFailed() ?
+ "Failed to catch-up: " + future.failure() :
+ "Not expecting discarded future");
+
+ terminate(self());
+ } else {
+ // Try to regain the ownership of the replica.
+ shared.own().onAny(defer(self(), &Self::owned, lambda::_1));
+ }
+ }
+
+ void owned(const Future<Owned<Replica> >& future)
+ {
+ if (!future.isReady()) {
+ promise.fail(
+ future.isFailed() ?
+ "Failed to own the replica: " + future.failure() :
+ "Not expecting discarded future");
+
+ terminate(self());
+ } else {
+ // Allow the replica to vote once the catch-up is done.
+ replica = future.get();
+ update(Metadata::VOTING);
+ }
+ }
+
+ void retry()
+ {
+ // We add a random delay before each retry because we do not want
+ // to saturate the network/disk IO in some cases (e.g., network
+ // size is less than quorum). The delay is chosen randomly to
+ // reduce the likelihood of conflicts (i.e., a replica receives a
+ // recover request while it is changing its status).
+ static const Duration T = Milliseconds(500);
+ Duration d = T * (1.0 + (double) ::random() / RAND_MAX);
+ delay(d, self(), &Self::recover);
+ }
+
+ const size_t quorum;
+ Owned<Replica> replica;
+ const Shared<Network> network;
+
+ Metadata::Status status;
+ set<Future<RecoverResponse> > responses;
+ hashmap<Metadata::Status, size_t> responsesReceived;
+ Option<uint64_t> lowestBeginPosition;
+ Option<uint64_t> highestEndPosition;
+ Future<Nothing> catching;
+
+ process::Promise<Owned<Replica> > promise;
+};
+
+
+Future<Owned<Replica> > recover(
+ size_t quorum,
+ const Owned<Replica>& replica,
+ const Shared<Network>& network)
+{
+ RecoverProcess* process = new RecoverProcess(quorum, replica, network);
+ Future<Owned<Replica> > future = process->future();
+ spawn(process, true);
+ return future;
+}
+
+} // namespace log {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/f9b60c4c/src/log/recover.hpp
----------------------------------------------------------------------
diff --git a/src/log/recover.hpp b/src/log/recover.hpp
new file mode 100644
index 0000000..634bc06
--- /dev/null
+++ b/src/log/recover.hpp
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LOG_RECOVER_HPP__
+#define __LOG_RECOVER_HPP__
+
+#include <stdint.h>
+
+#include <process/future.hpp>
+#include <process/owned.hpp>
+#include <process/shared.hpp>
+
+#include <stout/nothing.hpp>
+
+#include "log/network.hpp"
+#include "log/replica.hpp"
+
+namespace mesos {
+namespace internal {
+namespace log {
+
+// Recovers a replica by catching up enough missing positions. A
+// replica that starts with an empty log (e.g., in the case of a disk
+// failure) should not be allowed to vote. Otherwise, the new votes it
+// makes may contradict its lost votes, leading to potential
+// inconsistency in the log. Instead, the replica should be put in
+// non-voting status and catch up missing positions (and associated
+// Paxos states). The replica can be re-allowed to vote if the
+// following two conditions are met: 1) a sufficient amount of missing
+// positions are recovered such that if other replicas fail, the
+// remaining replicas can restore all the successfully written log
+// entries; 2) its future votes cannot contradict its lost votes.
+// This function returns an owned pointer to the recovered replica if
+// the recovery is successful.
+extern process::Future<process::Owned<Replica> > recover(
+ size_t quorum,
+ const process::Owned<Replica>& replica,
+ const process::Shared<Network>& network);
+
+} // namespace log {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __LOG_RECOVER_HPP__
[06/10] git commit: Refactored log tools and added a tool to
initialize the log.
Posted by be...@apache.org.
Refactored log tools and added a tool to initialize the log.
Also pulled the storage-related code out of replica.cpp (the code itself
is unchanged).
From: Jie Yu <yu...@gmail.com>
Review: https://reviews.apache.org/r/16433
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/e2fe5860
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/e2fe5860
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/e2fe5860
Branch: refs/heads/master
Commit: e2fe5860bc8542e5408bc86ac7322002326d41b3
Parents: f9b60c4
Author: Benjamin Hindman <be...@gmail.com>
Authored: Thu Jan 16 16:55:00 2014 -0800
Committer: Benjamin Hindman <be...@gmail.com>
Committed: Thu Jan 16 16:55:00 2014 -0800
----------------------------------------------------------------------
src/Makefile.am | 14 +-
src/log/leveldb.cpp | 422 +++++++++++++++++++++++++++++++++++++
src/log/leveldb.hpp | 51 +++++
src/log/main.cpp | 132 +++++-------
src/log/replica.cpp | 439 +--------------------------------------
src/log/replica.hpp | 6 +-
src/log/storage.hpp | 61 ++++++
src/log/tool.hpp | 51 +++++
src/log/tool/initialize.cpp | 148 +++++++++++++
src/log/tool/initialize.hpp | 63 ++++++
src/log/tool/read.cpp | 188 +++++++++++++++++
src/log/tool/read.hpp | 65 ++++++
12 files changed, 1120 insertions(+), 520 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/e2fe5860/src/Makefile.am
----------------------------------------------------------------------
diff --git a/src/Makefile.am b/src/Makefile.am
index 60fcb31..d58b46e 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -301,17 +301,25 @@ liblog_la_SOURCES = \
log/catchup.cpp \
log/consensus.cpp \
log/coordinator.cpp \
+ log/leveldb.cpp \
log/log.cpp \
log/recover.cpp \
- log/replica.cpp
+ log/replica.cpp \
+ log/tool/initialize.cpp \
+ log/tool/read.cpp
liblog_la_SOURCES += \
log/catchup.hpp \
log/consensus.hpp \
log/coordinator.hpp \
- log/recover.hpp \
- log/replica.hpp \
+ log/leveldb.hpp \
log/log.hpp \
log/network.hpp \
+ log/recover.hpp \
+ log/replica.hpp \
+ log/storage.hpp \
+ log/tool.hpp \
+ log/tool/initialize.hpp \
+ log/tool/read.hpp \
messages/log.hpp \
messages/log.proto
nodist_liblog_la_SOURCES = $(LOG_PROTOS)
http://git-wip-us.apache.org/repos/asf/mesos/blob/e2fe5860/src/log/leveldb.cpp
----------------------------------------------------------------------
diff --git a/src/log/leveldb.cpp b/src/log/leveldb.cpp
new file mode 100644
index 0000000..7819963
--- /dev/null
+++ b/src/log/leveldb.cpp
@@ -0,0 +1,422 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <google/protobuf/io/zero_copy_stream_impl.h>
+
+#include <leveldb/comparator.h>
+#include <leveldb/write_batch.h>
+
+#include <stout/check.hpp>
+#include <stout/error.hpp>
+#include <stout/numify.hpp>
+#include <stout/stopwatch.hpp>
+#include <stout/strings.hpp>
+
+#include "log/leveldb.hpp"
+
+using std::string;
+
+namespace mesos {
+namespace internal {
+namespace log {
+
+// Placeholder for a leveldb comparator that would order keys by
+// their varint-encoded position. It is not wired in anywhere yet
+// (the only uses in this file are commented out), and 'Compare'
+// deliberately aborts if it is ever called.
+class Varint64Comparator : public leveldb::Comparator
+{
+public:
+ virtual int Compare(
+ const leveldb::Slice& a,
+ const leveldb::Slice& b) const
+ {
+ // TODO(benh): Use varint comparator.
+ LOG(FATAL) << "Unimplemented";
+ // uint64_t left = position(a);
+ // uint64_t right = position(b);
+ // if (left < right) return -1;
+ // if (left == right) return 0;
+ // if (left > right) return 1;
+
+ // Not reached (LOG(FATAL) aborts), but a value-returning function
+ // must not be able to flow off its end.
+ return 0;
+ }
+
+ virtual const char* Name() const
+ {
+ // Note that this name MUST NOT CHANGE across uses of this
+ // comparator with the same DB (the semantics of doing so are
+ // undefined if the database doesn't catch this first).
+ return "varint64";
+ }
+
+ virtual void FindShortestSeparator(
+ string* start,
+ const leveldb::Slice& limit) const
+ {
+ // Intentional no-op.
+ }
+
+ virtual void FindShortSuccessor(string* key) const
+ {
+ // Intentional no-op.
+ }
+};
+
+
+// TODO(benh): Use varint comparator.
+// static Varint64Comparator comparator;
+
+
+// Returns a string representing the specified position. Note that we
+// adjust the actual position by incrementing it by 1 because we
+// reserve 0 for storing the promise record (Record::Promise,
+// DEPRECATED!), or the metadata (Record::Metadata). Pass
+// 'adjust = false' to obtain the key of that reserved slot itself.
+static string encode(uint64_t position, bool adjust = true)
+{
+ // Adjusted stringified representation is plus 1 of actual position.
+ position = adjust ? position + 1 : position;
+
+ // TODO(benh): Use varint encoding for VarInt64Comparator!
+ // string s;
+ // google::protobuf::io::StringOutputStream _stream(&s);
+ // google::protobuf::io::CodedOutputStream stream(&_stream);
+ // position = adjust ? position + 1 : position;
+ // stream.WriteVarint64(position);
+ // return s;
+
+ // The key is the position zero-padded to (at least) 10 digits.
+ // 'position' is a 64-bit unsigned value, so it has to be printed
+ // with a matching conversion: '%d' expects an 'int' and misprints
+ // values that do not fit in one. The output is unchanged for every
+ // value that '%d' printed correctly.
+ Try<string> s = strings::format(
+ "%.*llu", 10, static_cast<unsigned long long>(position));
+ CHECK_SOME(s);
+ return s.get();
+}
+
+
+// Parses the decimal key held in the specified slice and returns the
+// log position it stands for, i.e., the stored number minus 1.
+static uint64_t decode(const leveldb::Slice& s)
+{
+ // TODO(benh): Use varint decoding for VarInt64Comparator!
+ // uint64_t position;
+ // google::protobuf::io::ArrayInputStream _stream(s.data(), s.size());
+ // google::protobuf::io::CodedInputStream stream(&_stream);
+ // bool success = stream.ReadVarint64(&position);
+ // CHECK(success);
+ // return position - 1; // Actual position is less 1 of stringified.
+ const string key(s.data(), s.size());
+
+ Try<uint64_t> stored = numify<uint64_t>(key);
+ CHECK_SOME(stored);
+
+ // Actual position is less 1 of stringified.
+ return stored.get() - 1;
+}
+
+
+// Creates a storage object with no database attached yet: 'db' stays
+// NULL until 'restore' opens one, and 'first' starts at position 0.
+LevelDBStorage::LevelDBStorage()
+ : db(NULL), first(0)
+{
+ // Nothing to see here.
+}
+
+
+// Closes the database (if one was opened) by deleting the handle.
+LevelDBStorage::~LevelDBStorage()
+{
+ delete db; // Might be null if open failed in LevelDBStorage::restore.
+}
+
+
+// Opens (creating it if missing) the leveldb database at 'path',
+// compacts it, and scans every record to rebuild the replica state:
+// the metadata, the begin/end positions of the log and the sets of
+// learned/unlearned positions. Also caches in 'first' the lowest
+// position still stored in leveldb (used later during truncation).
+// Returns an error if the database cannot be opened, or if a record
+// cannot be deserialized or has an unknown type.
+Try<LevelDBStorage::State> LevelDBStorage::restore(const string& path)
+{
+ leveldb::Options options;
+ options.create_if_missing = true;
+
+ // TODO(benh): Can't use varint comparator until bug discussed at
+ // groups.google.com/group/leveldb/browse_thread/thread/17eac39168909ba7
+ // gets fixed. For now, we are using the default byte-wise
+ // comparator and *assuming* that the encoding from unsigned long to
+ // string produces a stable ordering. Checks below.
+ // options.comparator = &comparator;
+
+ const string& one = encode(1);
+ const string& two = encode(2);
+ const string& ten = encode(10);
+
+ CHECK(leveldb::BytewiseComparator()->Compare(one, two) < 0);
+ CHECK(leveldb::BytewiseComparator()->Compare(two, one) > 0);
+ CHECK(leveldb::BytewiseComparator()->Compare(one, ten) < 0);
+ CHECK(leveldb::BytewiseComparator()->Compare(ten, two) > 0);
+ CHECK(leveldb::BytewiseComparator()->Compare(ten, ten) == 0);
+
+ Stopwatch stopwatch;
+ stopwatch.start();
+
+ leveldb::Status status = leveldb::DB::Open(options, path, &db);
+
+ if (!status.ok()) {
+ // TODO(benh): Consider trying to repair the DB.
+ return Error(status.ToString());
+ }
+
+ LOG(INFO) << "Opened db in " << stopwatch.elapsed();
+
+ stopwatch.start(); // Restart the stopwatch.
+
+ // TODO(benh): Conditionally compact to avoid long recovery times?
+ db->CompactRange(NULL, NULL);
+
+ LOG(INFO) << "Compacted db in " << stopwatch.elapsed();
+
+ State state;
+ state.begin = 0;
+ state.end = 0;
+
+ // TODO(benh): Consider just reading the "promise" record (e.g.,
+ // 'encode(0, false)') and then iterating over the rest of the
+ // records and confirming that they are all indeed of type
+ // Record::Action.
+
+ stopwatch.start(); // Restart the stopwatch.
+
+ // NOTE: The iterator is owned by this function and must be deleted
+ // on every path out of it (including the error returns below), and
+ // in any case before 'db' itself is deleted.
+ leveldb::Iterator* iterator = db->NewIterator(leveldb::ReadOptions());
+
+ LOG(INFO) << "Created db iterator in " << stopwatch.elapsed();
+
+ stopwatch.start(); // Restart the stopwatch.
+
+ iterator->SeekToFirst();
+
+ LOG(INFO) << "Seeked to beginning of db in " << stopwatch.elapsed();
+
+ stopwatch.start(); // Restart the stopwatch.
+
+ uint64_t keys = 0;
+
+ while (iterator->Valid()) {
+ keys++;
+ const leveldb::Slice& slice = iterator->value();
+
+ google::protobuf::io::ArrayInputStream stream(slice.data(), slice.size());
+
+ Record record;
+
+ if (!record.ParseFromZeroCopyStream(&stream)) {
+ delete iterator; // Don't leak the iterator on failure.
+ return Error("Failed to deserialize record");
+ }
+
+ switch (record.type()) {
+ case Record::METADATA: {
+ CHECK(record.has_metadata());
+ state.metadata.CopyFrom(record.metadata());
+ break;
+ }
+
+ // DEPRECATED!
+ case Record::PROMISE: {
+ CHECK(record.has_promise());
+ // This replica is in old format. Set its status to VOTING
+ // since there is no catch-up logic in the old code and this
+ // replica is obviously not empty.
+ state.metadata.set_status(Metadata::VOTING);
+ state.metadata.set_promised(record.promise().proposal());
+ break;
+ }
+
+ case Record::ACTION: {
+ CHECK(record.has_action());
+ const Action& action = record.action();
+ if (action.has_learned() && action.learned()) {
+ state.learned.insert(action.position());
+ state.unlearned.erase(action.position());
+ if (action.has_type() && action.type() == Action::TRUNCATE) {
+ state.begin = std::max(state.begin, action.truncate().to());
+ }
+ } else {
+ state.learned.erase(action.position());
+ state.unlearned.insert(action.position());
+ }
+ state.end = std::max(state.end, action.position());
+ break;
+ }
+
+ default: {
+ delete iterator; // Don't leak the iterator on failure.
+ return Error("Bad record");
+ }
+ }
+
+ iterator->Next();
+ }
+
+ LOG(INFO) << "Iterated through " << keys
+ << " keys in the db in " << stopwatch.elapsed();
+
+ // Determine the first position still in leveldb so during a
+ // truncation we can attempt to delete all positions from the first
+ // position up to the truncate position. Note that this is not the
+ // beginning position of the log, but rather the first position that
+ // remains (i.e., hasn't been deleted) in leveldb.
+ iterator->Seek(encode(0));
+
+ if (iterator->Valid()) {
+ first = decode(iterator->key());
+ }
+
+ delete iterator;
+
+ return state;
+}
+
+
+// Synchronously writes the replica metadata to leveldb under the
+// reserved (unadjusted) key 0. Returns an error if the record cannot
+// be serialized or if the write fails.
+Try<Nothing> LevelDBStorage::persist(const Metadata& metadata)
+{
+ Stopwatch stopwatch;
+ stopwatch.start();
+
+ // Wrap the metadata into a record and serialize it.
+ Record entry;
+ entry.set_type(Record::METADATA);
+ entry.mutable_metadata()->CopyFrom(metadata);
+
+ string serialized;
+
+ if (!entry.SerializeToString(&serialized)) {
+ return Error("Failed to serialize record");
+ }
+
+ // Use a synced write.
+ leveldb::WriteOptions synced;
+ synced.sync = true;
+
+ const string key = encode(0, false);
+
+ leveldb::Status written = db->Put(synced, key, serialized);
+
+ if (!written.ok()) {
+ return Error(written.ToString());
+ }
+
+ LOG(INFO) << "Persisting metadata (" << serialized.size()
+ << " bytes) to leveldb took " << stopwatch.elapsed();
+
+ return Nothing();
+}
+
+
+// Synchronously writes the action to leveldb under the key of its
+// position. If the action is a *learned* truncate, additionally
+// makes a best-effort attempt to delete every stored position below
+// the truncate position. Returns an error only if the action cannot
+// be serialized or if the write of the action itself fails.
+Try<Nothing> LevelDBStorage::persist(const Action& action)
+{
+ Stopwatch stopwatch;
+ stopwatch.start();
+
+ Record entry;
+ entry.set_type(Record::ACTION);
+ entry.mutable_action()->MergeFrom(action);
+
+ string serialized;
+
+ if (!entry.SerializeToString(&serialized)) {
+ return Error("Failed to serialize record");
+ }
+
+ leveldb::WriteOptions synced;
+ synced.sync = true;
+
+ leveldb::Status written =
+ db->Put(synced, encode(action.position()), serialized);
+
+ if (!written.ok()) {
+ return Error(written.ToString());
+ }
+
+ LOG(INFO) << "Persisting action (" << serialized.size()
+ << " bytes) to leveldb took " << stopwatch.elapsed();
+
+ // Delete positions if a truncate action has been *learned*. Note
+ // that we do this in a best-effort fashion (i.e., we ignore any
+ // failures to the database since we can always try again).
+ const bool learnedTruncate =
+ action.has_type() && action.type() == Action::TRUNCATE &&
+ action.has_learned() && action.learned();
+
+ if (!learnedTruncate) {
+ return Nothing();
+ }
+
+ CHECK(action.has_truncate());
+
+ stopwatch.start(); // Restart the stopwatch.
+
+ // To actually perform the truncation in leveldb we need to remove
+ // all the keys that represent positions no longer in the log. We
+ // do this by attempting to delete all keys that represent the
+ // first position we know is still in leveldb up to (but
+ // excluding) the truncate position. Note that this works because
+ // the semantics of WriteBatch are such that even if the position
+ // doesn't exist (which is possible because this replica has some
+ // holes), we can attempt to delete the key that represents it and
+ // it will just ignore that key. This is *much* cheaper than
+ // actually iterating through the entire database instead (which
+ // was, for posterity, the original implementation). In addition,
+ // caching the "first" position we know is in the database is
+ // cheaper than using an iterator to determine the first position
+ // (which was, for posterity, the second implementation).
+
+ leveldb::WriteBatch batch;
+
+ // Add positions up to (but excluding) the truncate position to
+ // the batch starting at the first position still in leveldb.
+ uint64_t count = 0;
+ for (uint64_t position = first;
+ position < action.truncate().to();
+ position++) {
+ batch.Delete(encode(position));
+ count++;
+ }
+
+ // Nothing to delete.
+ if (count == 0) {
+ return Nothing();
+ }
+
+ // We do this write asynchronously (e.g., using default options).
+ leveldb::Status deleted = db->Write(leveldb::WriteOptions(), &batch);
+
+ if (!deleted.ok()) {
+ LOG(WARNING) << "Ignoring leveldb batch delete failure: "
+ << deleted.ToString();
+ return Nothing();
+ }
+
+ first = action.truncate().to(); // Save the new first position!
+
+ LOG(INFO) << "Deleting ~" << count
+ << " keys from leveldb took " << stopwatch.elapsed();
+
+ return Nothing();
+}
+
+
+// Looks up the record stored for the given log position and returns
+// the action it holds. Returns an error if the lookup fails, if the
+// stored bytes cannot be deserialized, or if the record is not an
+// action record.
+Try<Action> LevelDBStorage::read(uint64_t position)
+{
+ Stopwatch stopwatch;
+ stopwatch.start();
+
+ string bytes;
+
+ leveldb::Status fetched =
+ db->Get(leveldb::ReadOptions(), encode(position), &bytes);
+
+ if (!fetched.ok()) {
+ return Error(fetched.ToString());
+ }
+
+ google::protobuf::io::ArrayInputStream input(bytes.data(), bytes.size());
+
+ Record entry;
+
+ if (!entry.ParseFromZeroCopyStream(&input)) {
+ return Error("Failed to deserialize record");
+ }
+
+ if (entry.type() != Record::ACTION) {
+ return Error("Bad record");
+ }
+
+ LOG(INFO) << "Reading position from leveldb took " << stopwatch.elapsed();
+
+ return entry.action();
+}
+
+} // namespace log {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/e2fe5860/src/log/leveldb.hpp
----------------------------------------------------------------------
diff --git a/src/log/leveldb.hpp b/src/log/leveldb.hpp
new file mode 100644
index 0000000..7eb51be
--- /dev/null
+++ b/src/log/leveldb.hpp
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LOG_LEVELDB_HPP__
+#define __LOG_LEVELDB_HPP__
+
+#include <leveldb/db.h>
+
+#include "log/storage.hpp"
+
+namespace mesos {
+namespace internal {
+namespace log {
+
+// Concrete implementation of the storage interface using leveldb.
+class LevelDBStorage : public Storage
+{
+public:
+ LevelDBStorage();
+ virtual ~LevelDBStorage();
+
+ // Opens (creating it if missing) the database at 'path' and scans
+ // its records to rebuild the replica state. Returns an error if the
+ // database cannot be opened or a record is malformed.
+ virtual Try<State> restore(const std::string& path);
+
+ // Synchronously writes the metadata record (stored under the
+ // reserved key 0).
+ virtual Try<Nothing> persist(const Metadata& metadata);
+
+ // Synchronously writes the action under the key of its position. A
+ // learned truncate action additionally triggers a best-effort
+ // deletion of the positions below the truncate position.
+ virtual Try<Nothing> persist(const Action& action);
+
+ // Returns the action stored at 'position', or an error if it cannot
+ // be read or the stored record is not an action.
+ virtual Try<Action> read(uint64_t position);
+
+private:
+ leveldb::DB* db; // NULL until 'restore' has opened the database.
+ uint64_t first; // First position still in leveldb, used during truncation.
+};
+
+} // namespace log {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __LOG_LEVELDB_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/e2fe5860/src/log/main.cpp
----------------------------------------------------------------------
diff --git a/src/log/main.cpp b/src/log/main.cpp
index f07bd10..c37dd6f 100644
--- a/src/log/main.cpp
+++ b/src/log/main.cpp
@@ -16,119 +16,89 @@
* limitations under the License.
*/
+#include <string.h>
+
#include <iostream>
-#include <list>
#include <string>
-#include <process/process.hpp>
+#include <process/owned.hpp>
-#include <stout/check.hpp>
-#include <stout/flags.hpp>
#include <stout/foreach.hpp>
-#include <stout/none.hpp>
-#include <stout/option.hpp>
-#include <stout/os.hpp>
-
-#include "log/replica.hpp"
+#include <stout/hashmap.hpp>
-#include "logging/flags.hpp"
-#include "logging/logging.hpp"
+#include "log/tool.hpp"
+#include "log/tool/initialize.hpp"
+#include "log/tool/read.hpp"
using namespace mesos;
using namespace mesos::internal;
using namespace mesos::internal::log;
+using namespace process;
+
using std::cerr;
-using std::cout;
using std::endl;
using std::string;
+// All the registered tools.
+static hashmap<string, Owned<tool::Tool> > tools;
-void usage(const char* argv0, const flags::FlagsBase& flags)
+
+static void add(const Owned<tool::Tool>& tool)
{
- cerr << "Usage: " << os::basename(argv0).get() << " [...] path/to/log"
- << endl
- << "Supported options:" << endl
- << flags.usage();
+ tools[tool->name()] = tool;
}
-int main(int argc, char** argv)
+static void usage(const char* argv0)
{
- flags::Flags<logging::Flags> flags;
-
- Option<uint64_t> from;
- flags.add(&from,
- "from",
- "Position from which to start reading in the log");
-
- Option<uint64_t> to;
- flags.add(&to,
- "to",
- "Position from which to stop reading in the log");
-
- bool help;
- flags.add(&help,
- "help",
- "Prints this help message",
- false);
-
- Try<Nothing> load = flags.load(None(), argc, argv);
-
- if (load.isError()) {
- cerr << load.error() << endl;
- usage(argv[0], flags);
- exit(1);
- }
+ cerr << "Usage: " << argv0 << " <command> [OPTIONS]" << endl
+ << endl
+ << "Available commands:" << endl
+ << " help" << endl;
- if (help) {
- usage(argv[0], flags);
- exit(1);
+ // Get a list of available tools.
+ foreachkey (const string& name, tools) {
+ cerr << " " << name << endl;
}
+}
- process::initialize();
-
- logging::initialize(argv[0], flags);
-
- string path = argv[argc - 1];
-
- Replica replica(path);
-
- process::Future<uint64_t> begin = replica.beginning();
- process::Future<uint64_t> end = replica.ending();
-
- begin.await();
- end.await();
-
- CHECK(begin.isReady());
- CHECK(end.isReady());
- if (!from.isSome()) {
- from = begin.get();
- }
+int main(int argc, char** argv)
+{
+ // Register log tools.
+ add(Owned<tool::Tool>(new tool::Initialize()));
+ add(Owned<tool::Tool>(new tool::Read()));
- if (!to.isSome()) {
- to = end.get();
+ if (argc < 2) {
+ usage(argv[0]);
+ return 1;
}
- CHECK_SOME(from);
- CHECK_SOME(to);
-
- cerr << endl << "Attempting to read the log from "
- << from.get() << " to " << to.get() << endl << endl;
-
- process::Future<std::list<Action> > actions =
- replica.read(from.get(), to.get());
+ if (!strcmp(argv[1], "help")) {
+ if (argc == 2) {
+ usage(argv[0]);
+ return 0;
+ }
- actions.await();
+ // 'mesos-log help command' => 'mesos-log command --help'
+ argv[1] = argv[2];
+ argv[2] = (char*) "--help";
+ }
- CHECK(!actions.isFailed()) << actions.failure();
+ string command = argv[1];
- CHECK(actions.isReady());
+ if (!tools.contains(command)) {
+ cerr << "Cannot find command '" << command << "'" << endl << endl;
+ usage(argv[0]);
+ return 1;
+ }
- foreach (const Action& action, actions.get()) {
- cout << "----------------------------------------------" << endl;
- action.PrintDebugString();
+ // Execute the command.
+ Try<Nothing> execute = tools[command]->execute(argc, argv);
+ if (execute.isError()) {
+ cerr << execute.error() << endl;
+ return 1;
}
return 0;
http://git-wip-us.apache.org/repos/asf/mesos/blob/e2fe5860/src/log/replica.cpp
----------------------------------------------------------------------
diff --git a/src/log/replica.cpp b/src/log/replica.cpp
index da9310f..ec6e38c 100644
--- a/src/log/replica.cpp
+++ b/src/log/replica.cpp
@@ -16,12 +16,6 @@
* limitations under the License.
*/
-#include <google/protobuf/io/zero_copy_stream_impl.h>
-
-#include <leveldb/comparator.h>
-#include <leveldb/db.h>
-#include <leveldb/write_batch.h>
-
#include <algorithm>
#include <process/dispatch.hpp>
@@ -32,22 +26,18 @@
#include <stout/foreach.hpp>
#include <stout/none.hpp>
#include <stout/nothing.hpp>
-#include <stout/numify.hpp>
-#include <stout/stopwatch.hpp>
+#include <stout/result.hpp>
+#include <stout/try.hpp>
#include <stout/utils.hpp>
#include "common/type_utils.hpp"
+#include "log/leveldb.hpp"
#include "log/replica.hpp"
-
-#include "logging/logging.hpp"
-
-#include "messages/log.hpp"
+#include "log/storage.hpp"
using namespace process;
-using process::wait; // Necessary on some OS's to disambiguate.
-
using std::list;
using std::set;
using std::string;
@@ -66,425 +56,6 @@ Protocol<RecoverRequest, RecoverResponse> recover;
} // namespace protocol {
-struct State
-{
- Metadata metadata; // The metadata for the replica.
- uint64_t begin; // Beginning position of the log.
- uint64_t end; // Ending position of the log.
- set<uint64_t> learned; // Positions present and learned
- set<uint64_t> unlearned; // Positions present but unlearned.
-};
-
-
-// Abstract interface for reading and writing records.
-class Storage
-{
-public:
- virtual ~Storage() {}
- virtual Try<State> restore(const string& path) = 0;
- virtual Try<Nothing> persist(const Metadata& metadata) = 0;
- virtual Try<Nothing> persist(const Action& action) = 0;
- virtual Try<Action> read(uint64_t position) = 0;
-};
-
-
-// Concrete implementation of the storage interface using leveldb.
-class LevelDBStorage : public Storage
-{
-public:
- LevelDBStorage();
- virtual ~LevelDBStorage();
-
- virtual Try<State> restore(const string& path);
- virtual Try<Nothing> persist(const Metadata& metadata);
- virtual Try<Nothing> persist(const Action& action);
- virtual Try<Action> read(uint64_t position);
-
-private:
- class Varint64Comparator : public leveldb::Comparator
- {
- public:
- virtual int Compare(
- const leveldb::Slice& a,
- const leveldb::Slice& b) const
- {
- // TODO(benh): Use varint comparator.
- LOG(FATAL) << "Unimplemented";
- // uint64_t left = position(a);
- // uint64_t right = position(b);
- // if (left < right) return -1;
- // if (left == right) return 0;
- // if (left > right) return 1;
- }
-
- virtual const char* Name() const
- {
- // Note that this name MUST NOT CHANGE across uses of this
- // comparator with the same DB (the semantics of doing so are
- // undefined if the database doesn't catch this first).
- return "varint64";
- }
-
- virtual void FindShortestSeparator(
- string* start,
- const leveldb::Slice& limit) const
- {
- // Intentional no-op.
- }
-
- virtual void FindShortSuccessor(string* key) const
- {
- // Intentional no-op.
- }
- };
-
- // Returns a string representing the specified position. Note that
- // we adjust the actual position by incrementing it by 1 because we
- // reserve 0 for storing the promise record (Record::Promise,
- // DEPRECATED!), or the metadata (Record::Metadata).
- static string encode(uint64_t position, bool adjust = true)
- {
- // Adjusted stringified represenation is plus 1 of actual position.
- position = adjust ? position + 1 : position;
-
- // TODO(benh): Use varint encoding for VarInt64Comparator!
- // string s;
- // google::protobuf::io::StringOutputStream _stream(&s);
- // google::protobuf::io::CodedOutputStream stream(&_stream);
- // position = adjust ? position + 1 : position;
- // stream.WriteVarint64(position);
- // return s;
-
- Try<string> s = strings::format("%.*d", 10, position);
- CHECK_SOME(s);
- return s.get();
- }
-
- // Returns the position as represented in the specified slice
- // (performing a decrement as necessary to determine the actual
- // position represented).
- static uint64_t decode(const leveldb::Slice& s)
- {
- // TODO(benh): Use varint decoding for VarInt64Comparator!
- // uint64_t position;
- // google::protobuf::io::ArrayInputStream _stream(s.data(), s.size());
- // google::protobuf::io::CodedInputStream stream(&_stream);
- // bool success = stream.ReadVarint64(&position);
- // CHECK(success);
- // return position - 1; // Actual position is less 1 of stringified.
- Try<uint64_t> position = numify<uint64_t>(string(s.data(), s.size()));
- CHECK_SOME(position);
- return position.get() - 1; // Actual position is less 1 of stringified.
- }
-
- // Varint64Comparator comparator; // TODO(benh): Use varint comparator.
-
- leveldb::DB* db;
-
- uint64_t first; // First position still in leveldb, used during truncation.
-};
-
-
-LevelDBStorage::LevelDBStorage()
- : db(NULL), first(0)
-{
- // Nothing to see here.
-}
-
-
-LevelDBStorage::~LevelDBStorage()
-{
- delete db; // Might be null if open failed in LevelDBStorage::recover.
-}
-
-
-Try<State> LevelDBStorage::restore(const string& path)
-{
- leveldb::Options options;
- options.create_if_missing = true;
-
- // TODO(benh): Can't use varint comparator until bug discussed at
- // groups.google.com/group/leveldb/browse_thread/thread/17eac39168909ba7
- // gets fixed. For now, we are using the default byte-wise
- // comparator and *assuming* that the encoding from unsigned long to
- // string produces a stable ordering. Checks below.
- // options.comparator = &comparator;
-
- const string& one = encode(1);
- const string& two = encode(2);
- const string& ten = encode(10);
-
- CHECK(leveldb::BytewiseComparator()->Compare(one, two) < 0);
- CHECK(leveldb::BytewiseComparator()->Compare(two, one) > 0);
- CHECK(leveldb::BytewiseComparator()->Compare(one, ten) < 0);
- CHECK(leveldb::BytewiseComparator()->Compare(ten, two) > 0);
- CHECK(leveldb::BytewiseComparator()->Compare(ten, ten) == 0);
-
- Stopwatch stopwatch;
- stopwatch.start();
-
- leveldb::Status status = leveldb::DB::Open(options, path, &db);
-
- if (!status.ok()) {
- // TODO(benh): Consider trying to repair the DB.
- return Error(status.ToString());
- }
-
- LOG(INFO) << "Opened db in " << stopwatch.elapsed();
-
- stopwatch.start(); // Restart the stopwatch.
-
- // TODO(benh): Conditionally compact to avoid long recovery times?
- db->CompactRange(NULL, NULL);
-
- LOG(INFO) << "Compacted db in " << stopwatch.elapsed();
-
- State state;
- state.begin = 0;
- state.end = 0;
-
- // TODO(benh): Consider just reading the "promise" record (e.g.,
- // 'encode(0, false)') and then iterating over the rest of the
- // records and confirming that they are all indeed of type
- // Record::Action.
-
- stopwatch.start(); // Restart the stopwatch.
-
- leveldb::Iterator* iterator = db->NewIterator(leveldb::ReadOptions());
-
- LOG(INFO) << "Created db iterator in " << stopwatch.elapsed();
-
- stopwatch.start(); // Restart the stopwatch.
-
- iterator->SeekToFirst();
-
- LOG(INFO) << "Seeked to beginning of db in " << stopwatch.elapsed();
-
- stopwatch.start(); // Restart the stopwatch.
-
- uint64_t keys = 0;
-
- while (iterator->Valid()) {
- keys++;
- const leveldb::Slice& slice = iterator->value();
-
- google::protobuf::io::ArrayInputStream stream(slice.data(), slice.size());
-
- Record record;
-
- if (!record.ParseFromZeroCopyStream(&stream)) {
- return Error("Failed to deserialize record");
- }
-
- switch (record.type()) {
- case Record::METADATA: {
- CHECK(record.has_metadata());
- state.metadata.CopyFrom(record.metadata());
- break;
- }
-
- // DEPRECATED!
- case Record::PROMISE: {
- CHECK(record.has_promise());
- // This replica is in old format. Set its status to VOTING
- // since there is no catch-up logic in the old code and this
- // replica is obviously not empty.
- state.metadata.set_status(Metadata::VOTING);
- state.metadata.set_promised(record.promise().proposal());
- break;
- }
-
- case Record::ACTION: {
- CHECK(record.has_action());
- const Action& action = record.action();
- if (action.has_learned() && action.learned()) {
- state.learned.insert(action.position());
- state.unlearned.erase(action.position());
- if (action.has_type() && action.type() == Action::TRUNCATE) {
- state.begin = std::max(state.begin, action.truncate().to());
- }
- } else {
- state.learned.erase(action.position());
- state.unlearned.insert(action.position());
- }
- state.end = std::max(state.end, action.position());
- break;
- }
-
- default: {
- return Error("Bad record");
- }
- }
-
- iterator->Next();
- }
-
- LOG(INFO) << "Iterated through " << keys
- << " keys in the db in " << stopwatch.elapsed();
-
- // Determine the first position still in leveldb so during a
- // truncation we can attempt to delete all positions from the first
- // position up to the truncate position. Note that this is not the
- // beginning position of the log, but rather the first position that
- // remains (i.e., hasn't been deleted) in leveldb.
- iterator->Seek(encode(0));
-
- if (iterator->Valid()) {
- first = decode(iterator->key());
- }
-
- delete iterator;
-
- return state;
-}
-
-
-Try<Nothing> LevelDBStorage::persist(const Metadata& metadata)
-{
- Stopwatch stopwatch;
- stopwatch.start();
-
- leveldb::WriteOptions options;
- options.sync = true;
-
- Record record;
- record.set_type(Record::METADATA);
- record.mutable_metadata()->CopyFrom(metadata);
-
- string value;
-
- if (!record.SerializeToString(&value)) {
- return Error("Failed to serialize record");
- }
-
- leveldb::Status status = db->Put(options, encode(0, false), value);
-
- if (!status.ok()) {
- return Error(status.ToString());
- }
-
- LOG(INFO) << "Persisting metadata (" << value.size()
- << " bytes) to leveldb took " << stopwatch.elapsed();
-
- return Nothing();
-}
-
-
-Try<Nothing> LevelDBStorage::persist(const Action& action)
-{
- Stopwatch stopwatch;
- stopwatch.start();
-
- Record record;
- record.set_type(Record::ACTION);
- record.mutable_action()->MergeFrom(action);
-
- string value;
-
- if (!record.SerializeToString(&value)) {
- return Error("Failed to serialize record");
- }
-
- leveldb::WriteOptions options;
- options.sync = true;
-
- leveldb::Status status = db->Put(options, encode(action.position()), value);
-
- if (!status.ok()) {
- return Error(status.ToString());
- }
-
- LOG(INFO) << "Persisting action (" << value.size()
- << " bytes) to leveldb took " << stopwatch.elapsed();
-
- // Delete positions if a truncate action has been *learned*. Note
- // that we do this in a best-effort fashion (i.e., we ignore any
- // failures to the database since we can always try again).
- if (action.has_type() && action.type() == Action::TRUNCATE &&
- action.has_learned() && action.learned()) {
- CHECK(action.has_truncate());
-
- stopwatch.start(); // Restart the stopwatch.
-
- // To actually perform the truncation in leveldb we need to remove
- // all the keys that represent positions no longer in the log. We
- // do this by attempting to delete all keys that represent the
- // first position we know is still in leveldb up to (but
- // excluding) the truncate position. Note that this works because
- // the semantics of WriteBatch are such that even if the position
- // doesn't exist (which is possible because this replica has some
- // holes), we can attempt to delete the key that represents it and
- // it will just ignore that key. This is *much* cheaper than
- // actually iterating through the entire database instead (which
- // was, for posterity, the original implementation). In addition,
- // caching the "first" position we know is in the database is
- // cheaper than using an iterator to determine the first position
- // (which was, for posterity, the second implementation).
-
- leveldb::WriteBatch batch;
-
- // Add positions up to (but excluding) the truncate position to
- // the batch starting at the first position still in leveldb.
- uint64_t index = 0;
- while ((first + index) < action.truncate().to()) {
- batch.Delete(encode(first + index));
- index++;
- }
-
- // If we added any positions, attempt to delete them!
- if (index > 0) {
- // We do this write asynchronously (e.g., using default options).
- leveldb::Status status = db->Write(leveldb::WriteOptions(), &batch);
-
- if (!status.ok()) {
- LOG(WARNING) << "Ignoring leveldb batch delete failure: "
- << status.ToString();
- } else {
- first = action.truncate().to(); // Save the new first position!
-
- LOG(INFO) << "Deleting ~" << index
- << " keys from leveldb took " << stopwatch.elapsed();
- }
- }
- }
-
- return Nothing();
-}
-
-
-Try<Action> LevelDBStorage::read(uint64_t position)
-{
- Stopwatch stopwatch;
- stopwatch.start();
-
- leveldb::ReadOptions options;
-
- string value;
-
- leveldb::Status status = db->Get(options, encode(position), &value);
-
- if (!status.ok()) {
- return Error(status.ToString());
- }
-
- google::protobuf::io::ArrayInputStream stream(value.data(), value.size());
-
- Record record;
-
- if (!record.ParseFromZeroCopyStream(&stream)) {
- return Error("Failed to deserialize record");
- }
-
- if (record.type() != Record::ACTION) {
- return Error("Bad record");
- }
-
- LOG(INFO) << "Reading position from leveldb took " << stopwatch.elapsed();
-
- return record.action();
-}
-
-
class ReplicaProcess : public ProtobufProcess<ReplicaProcess>
{
public:
@@ -1140,7 +711,7 @@ bool ReplicaProcess::persist(const Action& action)
void ReplicaProcess::restore(const string& path)
{
- Try<State> state = storage->restore(path);
+ Try<Storage::State> state = storage->restore(path);
CHECK_SOME(state) << "Failed to recover the log";
http://git-wip-us.apache.org/repos/asf/mesos/blob/e2fe5860/src/log/replica.hpp
----------------------------------------------------------------------
diff --git a/src/log/replica.hpp b/src/log/replica.hpp
index ecb126d..467d0d9 100644
--- a/src/log/replica.hpp
+++ b/src/log/replica.hpp
@@ -19,14 +19,16 @@
#ifndef __LOG_REPLICA_HPP__
#define __LOG_REPLICA_HPP__
+#include <stdint.h>
+
#include <list>
#include <set>
#include <string>
+#include <process/future.hpp>
+#include <process/pid.hpp>
#include <process/protobuf.hpp>
-#include <stout/result.hpp>
-
#include "messages/log.hpp"
namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/e2fe5860/src/log/storage.hpp
----------------------------------------------------------------------
diff --git a/src/log/storage.hpp b/src/log/storage.hpp
new file mode 100644
index 0000000..663146f
--- /dev/null
+++ b/src/log/storage.hpp
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LOG_STORAGE_HPP__
+#define __LOG_STORAGE_HPP__
+
+#include <stdint.h>
+
+#include <set>
+#include <string>
+
+#include <stout/nothing.hpp>
+#include <stout/try.hpp>
+
+#include "messages/log.hpp"
+
+namespace mesos {
+namespace internal {
+namespace log {
+
+// Abstract interface for reading and writing records.
+class Storage
+{
+public:
+ struct State
+ {
+ Metadata metadata; // The metadata for the replica.
+ uint64_t begin; // Beginning position of the log.
+ uint64_t end; // Ending position of the log.
+ std::set<uint64_t> learned; // Positions present and learned
+ std::set<uint64_t> unlearned; // Positions present but unlearned.
+ };
+
+ virtual ~Storage() {}
+
+ virtual Try<State> restore(const std::string& path) = 0;
+ virtual Try<Nothing> persist(const Metadata& metadata) = 0;
+ virtual Try<Nothing> persist(const Action& action) = 0;
+ virtual Try<Action> read(uint64_t position) = 0;
+};
+
+} // namespace log {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __LOG_STORAGE_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/e2fe5860/src/log/tool.hpp
----------------------------------------------------------------------
diff --git a/src/log/tool.hpp b/src/log/tool.hpp
new file mode 100644
index 0000000..656d3f6
--- /dev/null
+++ b/src/log/tool.hpp
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LOG_TOOL_HPP__
+#define __LOG_TOOL_HPP__
+
+#include <string>
+
+#include <stout/nothing.hpp>
+#include <stout/try.hpp>
+
+namespace mesos {
+namespace internal {
+namespace log {
+namespace tool {
+
+// Represents a tool for processing a log file.
+class Tool
+{
+public:
+ virtual ~Tool() {}
+
+ virtual std::string name() const = 0;
+
+ // Executes the tool. The tool can be configured by passing in
+ // command line arguments. If command line arguments are not
+ // specified, the default configuration will be used.
+ virtual Try<Nothing> execute(int argc, char** argv) = 0;
+};
+
+} // namespace tool {
+} // namespace log {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __LOG_TOOL_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/e2fe5860/src/log/tool/initialize.cpp
----------------------------------------------------------------------
diff --git a/src/log/tool/initialize.cpp b/src/log/tool/initialize.cpp
new file mode 100644
index 0000000..ccda7fb
--- /dev/null
+++ b/src/log/tool/initialize.cpp
@@ -0,0 +1,148 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <sstream>
+
+#include <process/process.hpp>
+#include <process/timeout.hpp>
+
+#include <stout/error.hpp>
+
+#include "log/replica.hpp"
+#include "log/tool/initialize.hpp"
+
+#include "logging/logging.hpp"
+
+using namespace process;
+
+using std::endl;
+using std::ostringstream;
+using std::string;
+
+namespace mesos {
+namespace internal {
+namespace log {
+namespace tool {
+
+Initialize::Flags::Flags()
+{
+ add(&Flags::path,
+ "path",
+ "Path to the log");
+
+ add(&Flags::timeout,
+ "timeout",
+ "Maximum time allowed for the command to finish\n"
+ "(e.g., 500ms, 1sec, etc.)");
+
+ add(&Flags::help,
+ "help",
+ "Prints the help message",
+ false);
+}
+
+
+string Initialize::usage(const string& argv0) const
+{
+ ostringstream out;
+
+ out << "Usage: " << argv0 << " " << name() << " [OPTIONS]" << endl
+ << endl
+ << "This command is used to initialize the log" << endl
+ << endl
+ << "Supported OPTIONS:" << endl
+ << flags.usage();
+
+ return out.str();
+}
+
+
+Try<Nothing> Initialize::execute(int argc, char** argv)
+{
+ // Configure the tool by parsing command line arguments.
+ if (argc > 0 && argv != NULL) {
+ Try<Nothing> load = flags.load(None(), argc, argv);
+ if (load.isError()) {
+ return Error(load.error() + "\n\n" + usage(argv[0]));
+ }
+
+ if (flags.help) {
+ return Error(usage(argv[0]));
+ }
+
+ process::initialize();
+ logging::initialize(argv[0], flags);
+ }
+
+ if (flags.path.isNone()) {
+ return Error("Missing flag: '--path'");
+ }
+
+ // Setup the timeout if specified.
+ Option<Timeout> timeout = None();
+ if (flags.timeout.isSome()) {
+ timeout = Timeout::in(flags.timeout.get());
+ }
+
+ Replica replica(flags.path.get());
+
+ // Get the current status of the replica.
+ Future<Metadata::Status> status = replica.status();
+ if (timeout.isSome()) {
+ status.await(timeout.get().remaining());
+ } else {
+ status.await();
+ }
+
+ if (status.isPending()) {
+ return Error("Timed out while getting replica status");
+ } else if (status.isDiscarded()) {
+ return Error("Failed to get status of replica (discarded future)");
+ } else if (status.isFailed()) {
+ return Error(status.failure());
+ }
+
+ // We only initialize a log if it is empty.
+ if (status.get() != Metadata::EMPTY) {
+ return Error("The log is not empty");
+ }
+
+ // Update the status of the replica to VOTING.
+ Future<bool> update = replica.update(Metadata::VOTING);
+ if (timeout.isSome()) {
+ update.await(timeout.get().remaining());
+ } else {
+ update.await();
+ }
+
+ if (update.isPending()) {
+ return Error("Timed out while setting replica status");
+ } else if (update.isDiscarded()) {
+ return Error("Failed to set replica status (discarded future)");
+ } else if (update.isFailed()) {
+ return Error(update.failure());
+ }
+
+ return Nothing();
+}
+
+} // namespace tool {
+} // namespace log {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/e2fe5860/src/log/tool/initialize.hpp
----------------------------------------------------------------------
diff --git a/src/log/tool/initialize.hpp b/src/log/tool/initialize.hpp
new file mode 100644
index 0000000..10ac269
--- /dev/null
+++ b/src/log/tool/initialize.hpp
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LOG_TOOL_INITIALIZE_HPP__
+#define __LOG_TOOL_INITIALIZE_HPP__
+
+#include <stout/duration.hpp>
+#include <stout/flags.hpp>
+#include <stout/option.hpp>
+
+#include "log/tool.hpp"
+
+#include "logging/flags.hpp"
+
+namespace mesos {
+namespace internal {
+namespace log {
+namespace tool {
+
+class Initialize : public Tool
+{
+public:
+ class Flags : public logging::Flags
+ {
+ public:
+ Flags();
+
+ Option<std::string> path;
+ Option<Duration> timeout;
+ bool help;
+ };
+
+ virtual std::string name() const { return "initialize"; }
+ virtual Try<Nothing> execute(int argc = 0, char** argv = NULL);
+
+ // Users can change the default configuration by setting these flags.
+ Flags flags;
+
+private:
+ std::string usage(const std::string& argv0) const;
+};
+
+} // namespace tool {
+} // namespace log {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __LOG_TOOL_INITIALIZE_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/e2fe5860/src/log/tool/read.cpp
----------------------------------------------------------------------
diff --git a/src/log/tool/read.cpp b/src/log/tool/read.cpp
new file mode 100644
index 0000000..ab6068d
--- /dev/null
+++ b/src/log/tool/read.cpp
@@ -0,0 +1,188 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <sstream>
+
+#include <process/process.hpp>
+#include <process/timeout.hpp>
+
+#include <stout/error.hpp>
+
+#include "log/replica.hpp"
+#include "log/tool/read.hpp"
+
+#include "logging/logging.hpp"
+
+using namespace process;
+
+using std::cout;
+using std::endl;
+using std::list;
+using std::ostringstream;
+using std::string;
+
+namespace mesos {
+namespace internal {
+namespace log {
+namespace tool {
+
+Read::Flags::Flags()
+{
+ add(&Flags::path,
+ "path",
+ "Path to the log");
+
+ add(&Flags::from,
+ "from",
+ "Position from which to start reading the log");
+
+ add(&Flags::to,
+ "to",
+ "Position from which to stop reading the log");
+
+ add(&Flags::timeout,
+ "timeout",
+ "Maximum time allowed for the command to finish\n"
+ "(e.g., 500ms, 1sec, etc.)");
+
+ add(&Flags::help,
+ "help",
+ "Prints the help message",
+ false);
+}
+
+
+string Read::usage(const string& argv0) const
+{
+ ostringstream out;
+
+ out << "Usage: " << argv0 << " " << name() << " [OPTIONS]" << endl
+ << endl
+ << "This command is used to read the log" << endl
+ << endl
+ << "Supported OPTIONS:" << endl
+ << flags.usage();
+
+ return out.str();
+}
+
+
+Try<Nothing> Read::execute(int argc, char** argv)
+{
+ // Configure the tool by parsing command line arguments.
+ if (argc > 0 && argv != NULL) {
+ Try<Nothing> load = flags.load(None(), argc, argv);
+ if (load.isError()) {
+ return Error(load.error() + "\n\n" + usage(argv[0]));
+ }
+
+ if (flags.help) {
+ return Error(usage(argv[0]));
+ }
+
+ process::initialize();
+ logging::initialize(argv[0], flags);
+ }
+
+ if (flags.path.isNone()) {
+ return Error("Missing flag '--path'");
+ }
+
+ // Setup the timeout if specified.
+ Option<Timeout> timeout = None();
+ if (flags.timeout.isSome()) {
+ timeout = Timeout::in(flags.timeout.get());
+ }
+
+ Replica replica(flags.path.get());
+
+ // Get the beginning of the replica.
+ Future<uint64_t> begin = replica.beginning();
+ if (timeout.isSome()) {
+ begin.await(timeout.get().remaining());
+ } else {
+ begin.await();
+ }
+
+ if (begin.isPending()) {
+ return Error("Timed out while getting the beginning of the replica");
+ } else if (begin.isDiscarded()) {
+ return Error(
+ "Failed to get the beginning of the replica (discarded future)");
+ } else if (begin.isFailed()) {
+ return Error(begin.failure());
+ }
+
+ // Get the ending of the replica.
+ Future<uint64_t> end = replica.ending();
+ if (timeout.isSome()) {
+ end.await(timeout.get().remaining());
+ } else {
+ end.await();
+ }
+
+ if (end.isPending()) {
+ return Error("Timed out while getting the ending of the replica");
+ } else if (end.isDiscarded()) {
+ return Error(
+ "Failed to get the ending of the replica (discarded future)");
+ } else if (end.isFailed()) {
+ return Error(end.failure());
+ }
+
+ Option<uint64_t> from = flags.from;
+ if (from.isNone()) {
+ from = begin.get();
+ }
+
+ Option<uint64_t> to = flags.to;
+ if (to.isNone()) {
+ to = end.get();
+ }
+
+ LOG(INFO) << "Attempting to read the log from "
+ << from.get() << " to " << to.get() << endl;
+
+ Future<list<Action> > actions = replica.read(from.get(), to.get());
+ if (timeout.isSome()) {
+ actions.await(timeout.get().remaining());
+ } else {
+ actions.await();
+ }
+
+ if (actions.isPending()) {
+ return Error("Timed out while reading the replica");
+ } else if (actions.isDiscarded()) {
+ return Error("Failed to read the replica (discarded future)");
+ } else if (actions.isFailed()) {
+ return Error(actions.failure());
+ }
+
+ foreach (const Action& action, actions.get()) {
+ cout << "----------------------------------------------" << endl;
+ action.PrintDebugString();
+ }
+
+ return Nothing();
+}
+
+} // namespace tool {
+} // namespace log {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/e2fe5860/src/log/tool/read.hpp
----------------------------------------------------------------------
diff --git a/src/log/tool/read.hpp b/src/log/tool/read.hpp
new file mode 100644
index 0000000..74faec0
--- /dev/null
+++ b/src/log/tool/read.hpp
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LOG_TOOL_READ_HPP__
+#define __LOG_TOOL_READ_HPP__
+
+#include <stout/duration.hpp>
+#include <stout/flags.hpp>
+#include <stout/option.hpp>
+
+#include "log/tool.hpp"
+
+#include "logging/flags.hpp"
+
+namespace mesos {
+namespace internal {
+namespace log {
+namespace tool {
+
+class Read : public Tool
+{
+public:
+ class Flags : public logging::Flags
+ {
+ public:
+ Flags();
+
+ Option<std::string> path;
+ Option<uint64_t> from;
+ Option<uint64_t> to;
+ Option<Duration> timeout;
+ bool help;
+ };
+
+ virtual std::string name() const { return "read"; }
+ virtual Try<Nothing> execute(int argc = 0, char** argv = NULL);
+
+ // Users can change the default configuration by setting these flags.
+ Flags flags;
+
+private:
+ std::string usage(const std::string& argv0) const;
+};
+
+} // namespace tool {
+} // namespace log {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __LOG_TOOL_READ_HPP__
[02/10] git commit: Decoupled replicated log coordinator logic and
made it asynchronous.
Posted by be...@apache.org.
Decoupled replicated log coordinator logic and made it asynchronous.
This is the first patch in a series of patches that implement a
catch-up mechanism for the replicated log. See the following ticket for
more details: https://issues.apache.org/jira/browse/MESOS-736.
Here is a brief summary of this patch (sorry that we were not able to
break it into smaller patches):
1) Pulled the original Coordinator logic out and divided it into
several Paxos phases (see src/log/consensus.hpp). Instead of using
blocking semantics, we implemented all of the logic asynchronously.
2) In order to ensure the liveness of a replica that is catching up, we
implemented retry logic that bumps the proposal number. This also
required us to slightly change the existing replica protocol.
3) Made the "fill" operation independent of the underlying
replica. Instead, introduced a catchup function (see
src/log/catchup.hpp) to make sure the underlying local replica has
learned each write.
4) Modified the log tests to adapt to the new semantics (see (3)
above).
This is joint work with Yan Xu.
From: Jie Yu <yu...@gmail.com>
Review: https://reviews.apache.org/r/14631
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/19ad88b7
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/19ad88b7
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/19ad88b7
Branch: refs/heads/master
Commit: 19ad88b7c45164c1272001493bdd176d80a88b91
Parents: 2ff5308
Author: Benjamin Hindman <be...@gmail.com>
Authored: Thu Jan 16 16:51:45 2014 -0800
Committer: Benjamin Hindman <be...@gmail.com>
Committed: Thu Jan 16 16:51:45 2014 -0800
----------------------------------------------------------------------
src/Makefile.am | 17 +-
src/log/catchup.cpp | 286 +++++++++++++++++
src/log/catchup.hpp | 54 ++++
src/log/consensus.cpp | 711 +++++++++++++++++++++++++++++++++++++++++++
src/log/consensus.hpp | 136 +++++++++
src/log/coordinator.cpp | 472 ++++++----------------------
src/log/coordinator.hpp | 58 +---
src/log/log.hpp | 69 ++---
src/log/network.hpp | 57 ++--
src/log/replica.cpp | 351 +++++++++++----------
src/log/replica.hpp | 25 +-
src/messages/log.proto | 91 +++---
src/tests/log_tests.cpp | 420 +++++++++++++------------
13 files changed, 1866 insertions(+), 881 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/19ad88b7/src/Makefile.am
----------------------------------------------------------------------
diff --git a/src/Makefile.am b/src/Makefile.am
index cf0c8c6..17fbf83 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -297,9 +297,20 @@ libmesos_no_3rdparty_la_LIBADD += libbuild.la
# Convenience library for building the replicated log in order to
# include the leveldb headers.
noinst_LTLIBRARIES += liblog.la
-liblog_la_SOURCES = log/coordinator.cpp log/replica.cpp
-liblog_la_SOURCES += log/coordinator.hpp log/replica.hpp log/log.hpp \
- log/network.hpp messages/log.hpp messages/log.proto
+liblog_la_SOURCES = \
+ log/catchup.cpp \
+ log/consensus.cpp \
+ log/coordinator.cpp \
+ log/replica.cpp
+liblog_la_SOURCES += \
+ log/catchup.hpp \
+ log/consensus.hpp \
+ log/coordinator.hpp \
+ log/replica.hpp \
+ log/log.hpp \
+ log/network.hpp \
+ messages/log.hpp \
+ messages/log.proto
nodist_liblog_la_SOURCES = $(LOG_PROTOS)
liblog_la_CPPFLAGS = -I../$(LEVELDB)/include $(MESOS_CPPFLAGS)
http://git-wip-us.apache.org/repos/asf/mesos/blob/19ad88b7/src/log/catchup.cpp
----------------------------------------------------------------------
diff --git a/src/log/catchup.cpp b/src/log/catchup.cpp
new file mode 100644
index 0000000..5825eae
--- /dev/null
+++ b/src/log/catchup.cpp
@@ -0,0 +1,286 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <list>
+
+#include <process/collect.hpp>
+#include <process/id.hpp>
+#include <process/process.hpp>
+
+#include <stout/lambda.hpp>
+#include <stout/stringify.hpp>
+
+#include "log/catchup.hpp"
+#include "log/consensus.hpp"
+
+#include "messages/log.hpp"
+
+using namespace process;
+
+using std::list;
+using std::set;
+
+namespace mesos {
+namespace internal {
+namespace log {
+
+class CatchUpProcess : public Process<CatchUpProcess>
+{
+public:
+ CatchUpProcess(
+ size_t _quorum,
+ const Shared<Replica>& _replica,
+ const Shared<Network>& _network,
+ uint64_t _proposal,
+ uint64_t _position)
+ : ProcessBase(ID::generate("log-catch-up")),
+ quorum(_quorum),
+ replica(_replica),
+ network(_network),
+ position(_position),
+ proposal(_proposal) {}
+
+ virtual ~CatchUpProcess() {}
+
+ Future<uint64_t> future() { return promise.future(); }
+
+protected:
+ virtual void initialize()
+ {
+ // Stop when no one cares.
+ promise.future().onDiscarded(lambda::bind(
+ static_cast<void(*)(const UPID&, bool)>(terminate), self(), true));
+
+ check();
+ }
+
+ virtual void finalize()
+ {
+ checking.discard();
+ filling.discard();
+ }
+
+private:
+ void check()
+ {
+ checking = replica->missing(position);
+ checking.onAny(defer(self(), &Self::checked));
+ }
+
+ void checked()
+ {
+ // The future 'checking' can only be discarded in 'finalize'.
+ CHECK(!checking.isDiscarded());
+
+ if (checking.isFailed()) {
+ promise.fail("Failed to get missing positions: " + checking.failure());
+ terminate(self());
+ } else if (!checking.get()) {
+ // The position has been learned.
+ promise.set(proposal);
+ terminate(self());
+ } else {
+ // Still missing, try to fill it.
+ fill();
+ }
+ }
+
+ void fill()
+ {
+ filling = log::fill(quorum, network, proposal, position);
+ filling.onAny(defer(self(), &Self::filled));
+ }
+
+ void filled()
+ {
+ // The future 'filling' can only be discarded in 'finalize'.
+ CHECK(!filling.isDiscarded());
+
+ if (filling.isFailed()) {
+ promise.fail("Failed to fill missing position: " + filling.failure());
+ terminate(self());
+ } else {
+ // Update the proposal number so that we can save a proposal
+ // number bump round trip if we need to invoke fill again.
+ CHECK(filling.get().promised() >= proposal);
+ proposal = filling.get().promised();
+
+ check();
+ }
+ }
+
+ const size_t quorum;
+ const Shared<Replica> replica;
+ const Shared<Network> network;
+ const uint64_t position;
+
+ uint64_t proposal;
+
+ process::Promise<uint64_t> promise;
+ Future<bool> checking;
+ Future<Action> filling;
+};
+
+
+// Catches-up a single log position in the local replica. This
+// function returns the highest proposal number seen. The returned
+// proposal number can be used to save extra proposal number bumps.
+static Future<uint64_t> catchup(
+ size_t quorum,
+ const Shared<Replica>& replica,
+ const Shared<Network>& network,
+ uint64_t proposal,
+ uint64_t position)
+{
+ CatchUpProcess* process =
+ new CatchUpProcess(
+ quorum,
+ replica,
+ network,
+ proposal,
+ position);
+
+ Future<uint64_t> future = process->future();
+ spawn(process, true);
+ return future;
+}
+
+
+// TODO(jieyu): Our current implementation catches-up each position in
+// the set sequentially. In the future, we may want to parallelize it
+// to improve the performance. Also, we may want to implement rate
+// control here so that we don't saturate the network or disk.
+class BulkCatchUpProcess : public Process<BulkCatchUpProcess>
+{
+public:
+ BulkCatchUpProcess(
+ size_t _quorum,
+ const Shared<Replica>& _replica,
+ const Shared<Network>& _network,
+ uint64_t _proposal,
+ const set<uint64_t>& _positions)
+ : ProcessBase(ID::generate("log-bulk-catch-up")),
+ quorum(_quorum),
+ replica(_replica),
+ network(_network),
+ positions(_positions),
+ proposal(_proposal) {}
+
+ virtual ~BulkCatchUpProcess() {}
+
+ Future<Nothing> future() { return promise.future(); }
+
+protected:
+ virtual void initialize()
+ {
+ // Stop when no one cares.
+ promise.future().onDiscarded(lambda::bind(
+ static_cast<void(*)(const UPID&, bool)>(terminate), self(), true));
+
+ // Catch-up each position in the set sequentially.
+ it = positions.begin();
+
+ catchup();
+ }
+
+ virtual void finalize()
+ {
+ catching.discard();
+ }
+
+private:
+ void catchup()
+ {
+ if (it == positions.end()) {
+ promise.set(Nothing());
+ terminate(self());
+ return;
+ }
+
+ // Store the future so that we can discard it if the user wants to
+ // cancel the catch-up operation.
+ catching = log::catchup(quorum, replica, network, proposal, *it);
+ catching.onAny(defer(self(), &Self::caughtup));
+ }
+
+ void caughtup()
+ {
+ // The future 'catching' can only be discarded in 'finalize'.
+ CHECK(!catching.isDiscarded());
+
+ if (catching.isFailed()) {
+ promise.fail(
+ "Failed to catch-up position " + stringify(*it) +
+ ": " + catching.failure());
+ terminate(self());
+ return;
+ }
+
+ ++it;
+
+ // The single position catch-up function: 'log::catchup' will
+ // return the highest proposal number seen so far. We use this
+ // proposal number for the next 'catchup' as it is highly likely
+ // that this number is high enough, saving potentially unnecessary
+ // proposal number bumps.
+ proposal = catching.get();
+
+ catchup();
+ }
+
+ const size_t quorum;
+ const Shared<Replica> replica;
+ const Shared<Network> network;
+ const set<uint64_t> positions;
+
+ uint64_t proposal;
+ set<uint64_t>::iterator it;
+
+ process::Promise<Nothing> promise;
+ Future<uint64_t> catching;
+};
+
+
+/////////////////////////////////////////////////
+// Public interfaces below.
+/////////////////////////////////////////////////
+
+
+Future<Nothing> catchup(
+ size_t quorum,
+ const Shared<Replica>& replica,
+ const Shared<Network>& network,
+ uint64_t proposal,
+ const set<uint64_t>& positions)
+{
+ BulkCatchUpProcess* process =
+ new BulkCatchUpProcess(
+ quorum,
+ replica,
+ network,
+ proposal,
+ positions);
+
+ Future<Nothing> future = process->future();
+ spawn(process, true);
+ return future;
+}
+
+} // namespace log {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/19ad88b7/src/log/catchup.hpp
----------------------------------------------------------------------
diff --git a/src/log/catchup.hpp b/src/log/catchup.hpp
new file mode 100644
index 0000000..3652830
--- /dev/null
+++ b/src/log/catchup.hpp
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LOG_CATCHUP_HPP__
+#define __LOG_CATCHUP_HPP__
+
+#include <stdint.h>
+
+#include <set>
+
+#include <process/future.hpp>
+#include <process/shared.hpp>
+
+#include <stout/nothing.hpp>
+
+#include "log/network.hpp"
+#include "log/replica.hpp"
+
+namespace mesos {
+namespace internal {
+namespace log {
+
+// Catches up a set of log positions in the local replica. The user of
+// this function can provide a hint on the proposal number that will
+// be used for Paxos. This could potentially save us a few Paxos
+// rounds. However, if the user has no idea what proposal number to
+// use, they can just use an arbitrary proposal number (e.g., 0).
+extern process::Future<Nothing> catchup(
+ size_t quorum,
+ const process::Shared<Replica>& replica,
+ const process::Shared<Network>& network,
+ uint64_t proposal,
+ const std::set<uint64_t>& positions);
+
+} // namespace log {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __LOG_CATCHUP_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/19ad88b7/src/log/consensus.cpp
----------------------------------------------------------------------
diff --git a/src/log/consensus.cpp b/src/log/consensus.cpp
new file mode 100644
index 0000000..5eb90e7
--- /dev/null
+++ b/src/log/consensus.cpp
@@ -0,0 +1,711 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdlib.h>
+
+#include <set>
+
+#include <process/defer.hpp>
+#include <process/delay.hpp>
+#include <process/id.hpp>
+#include <process/process.hpp>
+
+#include <stout/check.hpp>
+#include <stout/duration.hpp>
+#include <stout/lambda.hpp>
+#include <stout/nothing.hpp>
+#include <stout/foreach.hpp>
+
+#include "log/consensus.hpp"
+#include "log/replica.hpp"
+
+using namespace process;
+
+using std::set;
+
+namespace mesos {
+namespace internal {
+namespace log {
+
+// Runs the promise phase for one explicit log position. The promise
+// request is broadcast to the network and the replies of the replicas
+// are aggregated until either a learned action shows up or 'quorum'
+// replies have been received.
+class ExplicitPromiseProcess : public Process<ExplicitPromiseProcess>
+{
+public:
+  ExplicitPromiseProcess(
+      size_t _quorum,
+      const Shared<Network>& _network,
+      uint64_t _proposal,
+      uint64_t _position)
+    : ProcessBase(ID::generate("log-explicit-promise")),
+      quorum(_quorum),
+      network(_network),
+      proposal(_proposal),
+      position(_position),
+      responsesReceived(0) {}
+
+  virtual ~ExplicitPromiseProcess() {}
+
+  // Future carrying the aggregated result of this phase.
+  Future<PromiseResponse> future() { return promise.future(); }
+
+protected:
+  virtual void initialize()
+  {
+    // Once the future handed out by 'future()' is discarded, no one
+    // cares about the result anymore, so terminate this process.
+    promise.future().onDiscarded(lambda::bind(
+        static_cast<void(*)(const UPID&, bool)>(terminate), self(), true));
+
+    request.set_proposal(proposal);
+    request.set_position(position);
+
+    network->broadcast(protocol::promise, request)
+      .onAny(defer(self(), &Self::broadcasted, lambda::_1));
+  }
+
+  virtual void finalize()
+  {
+    // This process is terminated when responses from a quorum of
+    // replicas have arrived. Responses from the other replicas are of
+    // no interest from then on, hence they are discarded here.
+    discard(responses);
+  }
+
+private:
+  // Continuation of the broadcast. On success, registers 'received'
+  // on each per-replica response; otherwise fails the phase.
+  void broadcasted(const Future<set<Future<PromiseResponse> > >& future)
+  {
+    if (!future.isReady()) {
+      if (future.isFailed()) {
+        promise.fail(
+            "Failed to broadcast explicit promise request: " +
+            future.failure());
+      } else {
+        promise.fail("Not expecting discarded future");
+      }
+
+      terminate(self());
+      return;
+    }
+
+    responses = future.get();
+    foreach (const Future<PromiseResponse>& response, responses) {
+      response.onReady(defer(self(), &Self::received, lambda::_1));
+    }
+  }
+
+  // Handles the (ready) response of a single replica.
+  void received(const PromiseResponse& response)
+  {
+    ++responsesReceived;
+
+    if (!response.okay()) {
+      // NACK: the replica refused the promise because this position
+      // has been promised to a proposer with a higher proposal
+      // number. That number is carried in the 'proposal' field of the
+      // response; remember the highest one seen.
+      if (highestNackProposal.isNone() ||
+          highestNackProposal.get() < response.proposal()) {
+        highestNackProposal = response.proposal();
+      }
+    } else if (highestNackProposal.isNone()) {
+      // ACK, and no NACK has been seen so far. The position has been
+      // promised to us, so the 'proposal' field must match the
+      // proposal we sent in the request.
+      CHECK_EQ(response.proposal(), request.proposal());
+
+      if (!response.has_action()) {
+        // No action is associated with the response, i.e., this
+        // proposer is the first one asking a promise for this log
+        // position.
+        CHECK(response.has_position());
+        CHECK_EQ(response.position(), position);
+      } else {
+        CHECK_EQ(response.action().position(), position);
+
+        const Action& action = response.action();
+
+        if (action.has_learned() && action.learned()) {
+          // A learned action. Note that we do not verify that several
+          // responses carry the _same_ learned action, we simply take
+          // the "first" one. There even is a specific case in which
+          // learned actions will NOT be the same: one replica may
+          // return a learned no-op because it knows the position has
+          // been truncated, while another replica (which has not
+          // learned the truncation yet) may return the actual action
+          // at this position. Picking either one is _correct_, since
+          // eventually this position will be truncated.
+          promise.set(response);
+
+          // The remaining responses get discarded in 'finalize'.
+          terminate(self());
+          return;
+        }
+
+        if (action.has_performed()) {
+          // An action has already been performed at this position.
+          // Keep the one performed with the highest proposal number.
+          if (highestAckAction.isNone() ||
+              (highestAckAction.get().performed() < action.performed())) {
+            highestAckAction = action;
+          }
+        }
+
+        // Otherwise the position had been promised to some other
+        // proposer, but no action had been performed or learned. It
+        // is now promised to us and there is nothing else to record.
+      }
+    }
+
+    // An ACK that follows a NACK is only counted: we still wait for
+    // more potential NACK responses to be able to return the highest
+    // proposal number seen, but further ACK responses do not matter.
+
+    if (responsesReceived >= quorum) {
+      // A quorum of replicas have replied.
+      PromiseResponse result;
+
+      if (highestNackProposal.isSome()) {
+        result.set_okay(false);
+        result.set_proposal(highestNackProposal.get());
+      } else {
+        result.set_okay(true);
+        if (highestAckAction.isSome()) {
+          result.mutable_action()->CopyFrom(highestAckAction.get());
+        }
+      }
+
+      promise.set(result);
+      terminate(self());
+    }
+  }
+
+  const size_t quorum;
+  const Shared<Network> network;
+  const uint64_t proposal;
+  const uint64_t position;
+
+  PromiseRequest request;
+  set<Future<PromiseResponse> > responses;
+  size_t responsesReceived;
+  Option<uint64_t> highestNackProposal;
+  Option<Action> highestAckAction;
+
+  process::Promise<PromiseResponse> promise;
+};
+
+
+// Runs the promise phase without a specific log position ("implicit"
+// promise). The request is broadcast to the network and the replies
+// are aggregated until 'quorum' of them have been received.
+class ImplicitPromiseProcess : public Process<ImplicitPromiseProcess>
+{
+public:
+  ImplicitPromiseProcess(
+      size_t _quorum,
+      const Shared<Network>& _network,
+      uint64_t _proposal)
+    : ProcessBase(ID::generate("log-implicit-promise")),
+      quorum(_quorum),
+      network(_network),
+      proposal(_proposal),
+      responsesReceived(0) {}
+
+  virtual ~ImplicitPromiseProcess() {}
+
+  // Future carrying the aggregated result of this phase.
+  Future<PromiseResponse> future() { return promise.future(); }
+
+protected:
+  virtual void initialize()
+  {
+    // Once the future handed out by 'future()' is discarded, no one
+    // cares about the result anymore, so terminate this process.
+    promise.future().onDiscarded(lambda::bind(
+        static_cast<void(*)(const UPID&, bool)>(terminate), self(), true));
+
+    request.set_proposal(proposal);
+
+    network->broadcast(protocol::promise, request)
+      .onAny(defer(self(), &Self::broadcasted, lambda::_1));
+  }
+
+  virtual void finalize()
+  {
+    // This process is terminated when responses from a quorum of
+    // replicas have arrived. Responses from the other replicas are of
+    // no interest from then on, hence they are discarded here.
+    discard(responses);
+  }
+
+private:
+  // Continuation of the broadcast. On success, registers 'received'
+  // on each per-replica response; otherwise fails the phase.
+  void broadcasted(const Future<set<Future<PromiseResponse> > >& future)
+  {
+    if (!future.isReady()) {
+      if (future.isFailed()) {
+        promise.fail(
+            "Failed to broadcast implicit promise request: " +
+            future.failure());
+      } else {
+        promise.fail("Not expecting discarded future");
+      }
+
+      terminate(self());
+      return;
+    }
+
+    responses = future.get();
+    foreach (const Future<PromiseResponse>& response, responses) {
+      response.onReady(defer(self(), &Self::received, lambda::_1));
+    }
+  }
+
+  // Handles the (ready) response of a single replica.
+  void received(const PromiseResponse& response)
+  {
+    ++responsesReceived;
+
+    if (!response.okay()) {
+      // NACK: the replica has already promised a proposer with a
+      // higher proposal number. That number is carried in the
+      // 'proposal' field of the response; remember the highest one.
+      if (highestNackProposal.isNone() ||
+          highestNackProposal.get() < response.proposal()) {
+        highestNackProposal = response.proposal();
+      }
+    } else if (highestNackProposal.isNone()) {
+      // ACK, and no NACK has been seen so far: track the highest
+      // position reported by the replicas.
+      CHECK(response.has_position());
+      if (highestEndPosition.isNone() ||
+          highestEndPosition.get() < response.position()) {
+        highestEndPosition = response.position();
+      }
+    }
+
+    // An ACK that follows a NACK is only counted: we still wait for
+    // more potential NACK responses to be able to return the highest
+    // proposal number seen, but further ACK responses do not matter.
+
+    if (responsesReceived >= quorum) {
+      // A quorum of replicas have replied.
+      PromiseResponse result;
+
+      if (highestNackProposal.isSome()) {
+        result.set_okay(false);
+        result.set_proposal(highestNackProposal.get());
+      } else {
+        CHECK_SOME(highestEndPosition);
+
+        result.set_okay(true);
+        result.set_position(highestEndPosition.get());
+      }
+
+      promise.set(result);
+      terminate(self());
+    }
+  }
+
+  const size_t quorum;
+  const Shared<Network> network;
+  const uint64_t proposal;
+
+  PromiseRequest request;
+  set<Future<PromiseResponse> > responses;
+  size_t responsesReceived;
+  Option<uint64_t> highestNackProposal;
+  Option<uint64_t> highestEndPosition;
+
+  process::Promise<PromiseResponse> promise;
+};
+
+
+// Runs the write phase for a single action. The write request is
+// broadcast to the network and the replies of the replicas are
+// aggregated until 'quorum' of them have been received.
+class WriteProcess : public Process<WriteProcess>
+{
+public:
+  WriteProcess(
+      size_t _quorum,
+      const Shared<Network>& _network,
+      uint64_t _proposal,
+      const Action& _action)
+    : ProcessBase(ID::generate("log-write")),
+      quorum(_quorum),
+      network(_network),
+      proposal(_proposal),
+      action(_action),
+      responsesReceived(0) {}
+
+  virtual ~WriteProcess() {}
+
+  // Future carrying the aggregated result of this phase.
+  Future<WriteResponse> future() { return promise.future(); }
+
+protected:
+  virtual void initialize()
+  {
+    // Once the future handed out by 'future()' is discarded, no one
+    // cares about the result anymore, so terminate this process.
+    promise.future().onDiscarded(lambda::bind(
+        static_cast<void(*)(const UPID&, bool)>(terminate), self(), true));
+
+    // Translate the action into a write request.
+    request.set_proposal(proposal);
+    request.set_position(action.position());
+    request.set_type(action.type());
+
+    switch (action.type()) {
+      case Action::NOP:
+        CHECK(action.has_nop());
+        request.mutable_nop();
+        break;
+      case Action::APPEND:
+        CHECK(action.has_append());
+        request.mutable_append()->CopyFrom(action.append());
+        break;
+      case Action::TRUNCATE:
+        CHECK(action.has_truncate());
+        request.mutable_truncate()->CopyFrom(action.truncate());
+        break;
+      default:
+        LOG(FATAL) << "Unknown Action::Type " << action.type();
+    }
+
+    network->broadcast(protocol::write, request)
+      .onAny(defer(self(), &Self::broadcasted, lambda::_1));
+  }
+
+  virtual void finalize()
+  {
+    // This process is terminated when responses from a quorum of
+    // replicas have arrived. Responses from the other replicas are of
+    // no interest from then on, hence they are discarded here.
+    discard(responses);
+  }
+
+private:
+  // Continuation of the broadcast. On success, registers 'received'
+  // on each per-replica response; otherwise fails the phase.
+  void broadcasted(const Future<set<Future<WriteResponse> > >& future)
+  {
+    if (!future.isReady()) {
+      if (future.isFailed()) {
+        promise.fail(
+            "Failed to broadcast the write request: " + future.failure());
+      } else {
+        promise.fail("Not expecting discarded future");
+      }
+
+      terminate(self());
+      return;
+    }
+
+    responses = future.get();
+    foreach (const Future<WriteResponse>& response, responses) {
+      response.onReady(defer(self(), &Self::received, lambda::_1));
+    }
+  }
+
+  // Handles the (ready) response of a single replica.
+  void received(const WriteResponse& response)
+  {
+    CHECK_EQ(response.position(), request.position());
+
+    ++responsesReceived;
+
+    if (!response.okay()) {
+      // NACK: the replica rejected the write because this position
+      // has been promised to a proposer with a higher proposal
+      // number. That number is carried in the 'proposal' field of the
+      // response; remember the highest one seen.
+      const bool higher =
+        highestNackProposal.isNone() ||
+        highestNackProposal.get() < response.proposal();
+
+      if (higher) {
+        highestNackProposal = response.proposal();
+      }
+    }
+
+    if (responsesReceived >= quorum) {
+      // A quorum of replicas have replied.
+      WriteResponse result;
+
+      if (highestNackProposal.isSome()) {
+        result.set_okay(false);
+        result.set_proposal(highestNackProposal.get());
+      } else {
+        result.set_okay(true);
+      }
+
+      promise.set(result);
+      terminate(self());
+    }
+  }
+
+  const size_t quorum;
+  const Shared<Network> network;
+  const uint64_t proposal;
+  const Action action;
+
+  WriteRequest request;
+  set<Future<WriteResponse> > responses;
+  size_t responsesReceived;
+  Option<uint64_t> highestNackProposal;
+
+  process::Promise<WriteResponse> promise;
+};
+
+
+// Tries to reach consensus for a single log position by chaining the
+// promise, write and learn phases. Whenever the promise or the write
+// phase is rejected (NACK), the proposal number is bumped and the
+// round is restarted after a randomized back-off.
+class FillProcess : public Process<FillProcess>
+{
+public:
+  FillProcess(
+      size_t _quorum,
+      const Shared<Network>& _network,
+      uint64_t _proposal,
+      uint64_t _position)
+    : ProcessBase(ID::generate("log-fill")),
+      quorum(_quorum),
+      network(_network),
+      position(_position),
+      proposal(_proposal) {}
+
+  virtual ~FillProcess() {}
+
+  // Future carrying the learned action once the operation succeeds.
+  Future<Action> future() { return promise.future(); }
+
+protected:
+  virtual void initialize()
+  {
+    // Stop when no one cares.
+    promise.future().onDiscarded(lambda::bind(
+        static_cast<void(*)(const UPID&, bool)>(terminate), self(), true));
+
+    runPromisePhase();
+  }
+
+  virtual void finalize()
+  {
+    // Cancel whatever phase may still be in flight.
+    promising.discard();
+    writing.discard();
+  }
+
+private:
+  // Starts an explicit promise phase for 'position' using the current
+  // proposal number.
+  void runPromisePhase()
+  {
+    promising = log::promise(quorum, network, proposal, position);
+    promising.onAny(defer(self(), &Self::checkPromisePhase));
+  }
+
+  // Inspects the outcome of the promise phase and decides which phase
+  // to run next.
+  void checkPromisePhase()
+  {
+    // The future 'promising' can only be discarded in 'finalize'.
+    CHECK(!promising.isDiscarded());
+
+    if (promising.isFailed()) {
+      promise.fail("Explicit promise phase failed: " + promising.failure());
+      terminate(self());
+    } else {
+      const PromiseResponse& response = promising.get();
+      if (!response.okay()) {
+        // Retry with a higher proposal number.
+        retry(response.proposal());
+      } else if (response.has_action()) {
+        // A previously performed write has been found. Paxos
+        // restricts us to write the same value.
+        Action action = response.action();
+
+        CHECK_EQ(action.position(), position);
+        CHECK(action.has_type());
+        action.set_promised(proposal);
+        action.set_performed(proposal);
+
+        if (action.has_learned() && action.learned()) {
+          // If the promise phase returns a learned action, we simply
+          // learn the action by broadcasting a learned message. We
+          // don't check if a quorum of replicas acknowledge the
+          // learned message. Because of that, a catch-up replica
+          // needs to make sure that all positions it needs to recover
+          // have been learned before it can re-join the Paxos (i.e.,
+          // invoking log::catchup). Otherwise, we may not have a
+          // quorum of replicas remember an agreed value, leading to
+          // potential inconsistency in the log.
+          runLearnPhase(action);
+        } else {
+          runWritePhase(action);
+        }
+      } else {
+        // No previously performed write has been found. We can
+        // write any value. We choose to write a NOP.
+        Action action;
+        action.set_position(position);
+        action.set_promised(proposal);
+        action.set_performed(proposal);
+        action.set_type(Action::NOP);
+        action.mutable_nop();
+
+        runWritePhase(action);
+      }
+    }
+  }
+
+  // Starts the write phase for the given (not yet learned) action.
+  void runWritePhase(const Action& action)
+  {
+    CHECK(!action.has_learned() || !action.learned());
+
+    writing = log::write(quorum, network, proposal, action);
+    writing.onAny(defer(self(), &Self::checkWritePhase, action));
+  }
+
+  // Inspects the outcome of the write phase for 'action'.
+  void checkWritePhase(const Action& action)
+  {
+    // The future 'writing' can only be discarded in 'finalize'.
+    CHECK(!writing.isDiscarded());
+
+    if (writing.isFailed()) {
+      promise.fail("Write phase failed: " + writing.failure());
+      terminate(self());
+    } else {
+      const WriteResponse& response = writing.get();
+      if (!response.okay()) {
+        // Retry with a higher proposal number.
+        retry(response.proposal());
+      } else {
+        // The write has been accepted (and thus performed) by a
+        // quorum of replicas. A consensus has been reached.
+        Action learnedAction = action;
+        learnedAction.set_learned(true);
+
+        runLearnPhase(learnedAction);
+      }
+    }
+  }
+
+  // Broadcasts the given learned action.
+  void runLearnPhase(const Action& action)
+  {
+    CHECK(action.has_learned() && action.learned());
+
+    // We need to make sure that the learned message has been
+    // broadcasted before the fill process completes. Some users may
+    // rely on this invariant (e.g. checking if the local replica has
+    // learned the action).
+    log::learn(network, action)
+      .onAny(defer(self(), &Self::checkLearnPhase, action, lambda::_1));
+  }
+
+  // Completes the fill operation once the learned message has been
+  // broadcast, or fails it if the broadcast did not succeed.
+  void checkLearnPhase(const Action& action, const Future<Nothing>& future)
+  {
+    if (!future.isReady()) {
+      // Report the failure under the name of the phase that actually
+      // failed, so it cannot be confused with a write phase failure.
+      promise.fail(
+          future.isFailed() ?
+          "Learn phase failed: " + future.failure() :
+          "Not expecting discarded future");
+      terminate(self());
+    } else {
+      promise.set(action);
+      terminate(self());
+    }
+  }
+
+  // Bumps the proposal number above 'highestNackProposal' and
+  // re-runs the promise phase after a randomized delay.
+  void retry(uint64_t highestNackProposal)
+  {
+    // See comments below.
+    static const Duration T = Milliseconds(100);
+
+    // Bump the proposal number.
+    CHECK(highestNackProposal >= proposal);
+    proposal = highestNackProposal + 1;
+
+    // Randomized back-off. Generate a random delay in [T, 2T). T has
+    // to be chosen carefully. We want T >> broadcast time such that
+    // one proposer usually times out and wins before others wake up.
+    // On the other hand, we want T to be as small as possible such
+    // that we can reduce the wait time.
+    // NOTE(review): RAND_MAX is the bound of 'rand()', not of
+    // 'random()'; this presumably relies on both being 2^31 - 1 on
+    // the supported platforms -- confirm.
+    Duration d = T * (1.0 + static_cast<double>(::random()) / RAND_MAX);
+    delay(d, self(), &Self::runPromisePhase);
+  }
+
+  const size_t quorum;
+  const Shared<Network> network;
+  const uint64_t position;
+
+  uint64_t proposal;
+
+  process::Promise<Action> promise;
+  Future<PromiseResponse> promising;
+  Future<WriteResponse> writing;
+};
+
+
+/////////////////////////////////////////////////
+// Public interfaces below.
+/////////////////////////////////////////////////
+
+
+// Spawns the process implementing the promise phase and returns its
+// future: an explicit promise if a position is given, an implicit
+// promise otherwise.
+Future<PromiseResponse> promise(
+    size_t quorum,
+    const Shared<Network>& network,
+    uint64_t proposal,
+    const Option<uint64_t>& position)
+{
+  if (position.isSome()) {
+    ExplicitPromiseProcess* explicitPromise = new ExplicitPromiseProcess(
+        quorum, network, proposal, position.get());
+
+    // The future is fetched before the process is spawned.
+    Future<PromiseResponse> result = explicitPromise->future();
+    spawn(explicitPromise, true);
+    return result;
+  }
+
+  ImplicitPromiseProcess* implicitPromise = new ImplicitPromiseProcess(
+      quorum, network, proposal);
+
+  // The future is fetched before the process is spawned.
+  Future<PromiseResponse> result = implicitPromise->future();
+  spawn(implicitPromise, true);
+  return result;
+}
+
+
+// Spawns a 'WriteProcess' for the given action and returns the
+// future exposed by that process.
+Future<WriteResponse> write(
+    size_t quorum,
+    const Shared<Network>& network,
+    uint64_t proposal,
+    const Action& action)
+{
+  WriteProcess* writer = new WriteProcess(quorum, network, proposal, action);
+
+  // The future is fetched before the process is spawned.
+  Future<WriteResponse> result = writer->future();
+  spawn(writer, true);
+  return result;
+}
+
+
+// Broadcasts a learned message for the given action. The action put
+// into the message is always marked as learned, no matter whether
+// the caller has set the flag.
+Future<Nothing> learn(const Shared<Network>& network, const Action& action)
+{
+  LearnedMessage message;
+
+  Action* learned = message.mutable_action();
+  learned->CopyFrom(action);
+
+  const bool alreadyLearned = action.has_learned() && action.learned();
+  if (!alreadyLearned) {
+    learned->set_learned(true);
+  }
+
+  return network->broadcast(message);
+}
+
+
+// Spawns a 'FillProcess' for the given position and returns the
+// future exposed by that process.
+Future<Action> fill(
+    size_t quorum,
+    const Shared<Network>& network,
+    uint64_t proposal,
+    uint64_t position)
+{
+  FillProcess* filler = new FillProcess(quorum, network, proposal, position);
+
+  // The future is fetched before the process is spawned.
+  Future<Action> result = filler->future();
+  spawn(filler, true);
+  return result;
+}
+
+} // namespace log {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/19ad88b7/src/log/consensus.hpp
----------------------------------------------------------------------
diff --git a/src/log/consensus.hpp b/src/log/consensus.hpp
new file mode 100644
index 0000000..ba41601
--- /dev/null
+++ b/src/log/consensus.hpp
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LOG_CONSENSUS_HPP__
+#define __LOG_CONSENSUS_HPP__
+
+#include <stdint.h>
+
+#include <process/future.hpp>
+#include <process/shared.hpp>
+
+#include <stout/none.hpp>
+#include <stout/nothing.hpp>
+#include <stout/option.hpp>
+
+#include "log/network.hpp"
+
+#include "messages/log.hpp"
+
+// We use the Paxos consensus protocol to agree on the value of each
+// entry in the replicated log. In our system, each replica is both an
+// acceptor and a learner. There are several types of proposers in the
+// system. The coordinator is one type of proposer, which we use to
+// append new log entries. The 'log::fill' function below creates an
+// internal proposer each time it is called. These internal proposers
+// are used to agree on previously written entries in the log.
+
+namespace mesos {
+namespace internal {
+namespace log {
+
+// Runs the promise phase (a.k.a., the prepare phase) in Paxos. This
+// phase has two purposes. First, the proposer asks promises from a
+// quorum of replicas not to accept writes from proposers with lower
+// proposal numbers. Second, the proposer looks for potential
+// previously agreed values. Only these values can be written in the
+// next phase. This restriction is used by Paxos to make sure that if
+// a value has been agreed on for a log position, subsequent writes to
+// this log position will always have the same value. We can run the
+// promise phase either for a specified log position ("explicit"
+// promise), or for all positions that have not yet been promised to
+// any proposer ("implicit" promise). The latter is a well known
+// optimization called Multi-Paxos. If the leader is relatively
+// stable, we can skip the promise phase for future instances of the
+// protocol with the same leader.
+//
+// We re-use PromiseResponse to specify the return value of this
+// phase. In the case of explicit promise, if a learned action has
+// been found in a response (and no Nack has been received before
+// it), this phase succeeds immediately with the 'okay' field set to
+// true and the 'action' field set to the learned action. If no
+// learned action has been found in a quorum of replicas, we first
+// check if some of them reply Nack (i.e., they refuse to give
+// promise). If yes, we set the 'okay' field to false and set the
+// 'proposal' field to be the highest proposal number seen in these
+// Nack responses. If none of them replies Nack, we set the 'okay'
+// field to true and set the 'action' field to be the action that is
+// performed by the proposer with the highest proposal number in
+// these responses. If no action has been found in these responses,
+// we leave the 'action' field unset.
+//
+// In the case of implicit promise, we must wait until a quorum of
+// replicas have replied. If some of them reply Nack, we set the
+// 'okay' field to false and set the 'proposal' field to be the
+// highest proposal number seen in these Nack responses. If none of
+// them replies Nack, we set the 'okay' field to true and set the
+// 'position' field to be the highest position (end position) seen in
+// these responses.
+extern process::Future<PromiseResponse> promise(
+ size_t quorum,
+ const process::Shared<Network>& network,
+ uint64_t proposal,
+ const Option<uint64_t>& position = None());
+
+
+// Runs the write phase (a.k.a., the propose phase) in Paxos. In this
+// phase, the proposer broadcasts a write to replicas. This phase
+// succeeds if a quorum of replicas accept the write. A proposer
+// cannot write if it hasn't gained enough (i.e., a quorum of)
+// promises from replicas. We re-use WriteResponse to specify the
+// return value of this phase. We must wait until a quorum of replicas
+// have replied. If some of them reply Nack, we set the 'okay' field
+// to false and set the 'proposal' field to be the highest proposal
+// number seen in these Nack responses. If none of them replies Nack,
+// we set the 'okay' field to true.
+extern process::Future<WriteResponse> write(
+ size_t quorum,
+ const process::Shared<Network>& network,
+ uint64_t proposal,
+ const Action& action);
+
+
+// Runs the learn phase (a.k.a., the commit phase) in Paxos. In fact,
+// this phase is not required, but treated as an optimization. In this
+// phase, a proposer broadcasts a learned message to replicas,
+// indicating that a consensus has already been reached for the given
+// log position. No need to wait for responses from replicas. When
+// the future is ready, the learned message has been broadcast.
+extern process::Future<Nothing> learn(
+ const process::Shared<Network>& network,
+ const Action& action);
+
+
+// Tries to reach consensus for the given log position by running a
+// full Paxos round (i.e., promise -> write -> learn). If no value has
+// been previously agreed on for the given log position, a NOP will be
+// proposed. This function will automatically retry by bumping the
+// proposal number if the specified proposal number is found to be not
+// high enough. To ensure liveness, it will inject a random delay
+// before retrying. A learned action will be returned when the
+// operation succeeds.
+extern process::Future<Action> fill(
+ size_t quorum,
+ const process::Shared<Network>& network,
+ uint64_t proposal,
+ uint64_t position);
+
+} // namespace log {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __LOG_CONSENSUS_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/19ad88b7/src/log/coordinator.cpp
----------------------------------------------------------------------
diff --git a/src/log/coordinator.cpp b/src/log/coordinator.cpp
index 6e6466f..b2ead8e 100644
--- a/src/log/coordinator.cpp
+++ b/src/log/coordinator.cpp
@@ -18,38 +18,33 @@
#include <algorithm>
-#include <process/dispatch.hpp>
-#include <process/future.hpp>
-
-#include <stout/check.hpp>
-#include <stout/duration.hpp>
#include <stout/error.hpp>
-#include <stout/foreach.hpp>
#include <stout/none.hpp>
+#include "log/catchup.hpp"
+#include "log/consensus.hpp"
#include "log/coordinator.hpp"
-#include "log/replica.hpp"
+
+#include "messages/log.hpp"
using namespace process;
-using std::list;
-using std::pair;
using std::set;
using std::string;
-
namespace mesos {
namespace internal {
namespace log {
-Coordinator::Coordinator(int _quorum,
- Replica* _replica,
- Network* _network)
- : elected(false),
- quorum(_quorum),
+Coordinator::Coordinator(
+ size_t _quorum,
+ const Shared<Replica>& _replica,
+ const Shared<Network>& _network)
+ : quorum(_quorum),
replica(_replica),
network(_network),
- id(0),
+ elected(false),
+ proposal(0),
index(0) {}
@@ -67,94 +62,76 @@ Result<uint64_t> Coordinator::elect(const Timeout& timeout)
}
// Get the highest known promise from our local replica.
- Future<uint64_t> promise = replica->promised();
+ Future<uint64_t> promised = replica->promised();
- if (!promise.await(timeout.remaining())) {
+ if (!promised.await(timeout.remaining())) {
+ promised.discard();
return None();
- } else if (promise.isFailed()) {
- return Error(promise.failure());
+ } else if (promised.isFailed()) {
+ return Error(promised.failure());
}
- CHECK(promise.isReady()) << "Not expecting a discarded future!";
-
- id = std::max(id, promise.get()) + 1; // Try the next highest!
-
- PromiseRequest request;
- request.set_id(id);
-
- // Broadcast the request to the network.
- set<Future<PromiseResponse> > futures =
- broadcast(protocol::promise, request);
-
- uint32_t okays = 0;
-
- do {
- Future<Future<PromiseResponse> > future = select(futures);
- if (future.await(timeout.remaining())) {
- CHECK(future.get().isReady());
- const PromiseResponse& response = future.get().get();
- if (!response.okay()) {
- return None(); // Lost an election, but can retry.
- } else if (response.okay()) {
- CHECK(response.has_position());
- index = std::max(index, response.position());
- okays++;
- if (okays >= quorum) {
- break;
- }
- }
- futures.erase(future.get());
- }
- } while (timeout.remaining() > Seconds(0));
+ CHECK(promised.isReady()) << "Not expecting a discarded future!";
+
+ proposal = std::max(proposal, promised.get()) + 1; // Try the next highest!
+
+ // Run the implicit promise phase.
+ Future<PromiseResponse> promising = log::promise(quorum, network, proposal);
+
+ if (!promising.await(timeout.remaining())) {
+ promising.discard();
+ return None();
+ } else if (promising.isFailed()) {
+ return Error(promising.failure());
+ }
- // Discard the remaining futures.
- discard(futures);
+ CHECK(promising.isReady()) << "Not expecting a discarded future!";
- // Either we have a quorum or we timed out.
- if (okays >= quorum) {
+ const PromiseResponse& response = promising.get();
+ if (!response.okay()) {
+ // Lost an election, but can retry.
+ proposal = response.proposal();
+ return None();
+ } else {
LOG(INFO) << "Coordinator elected, attempting to fill missing positions";
- elected = true;
- // Need to "catchup" local replica (i.e., fill in any unlearned
+ CHECK(response.has_position());
+
+ index = response.position();
+
+ // Need to "catch-up" local replica (i.e., fill in any unlearned
// and/or missing positions) so that we can do local reads.
// Usually we could do this lazily, however, a local learned
// position might have been truncated, so we actually need to
- // catchup the local replica all the way to the end of the log
+ // catch-up the local replica all the way to the end of the log
// before we can perform any up-to-date local reads.
- Future<set<uint64_t> > positions = replica->missing(index);
+ Future<set<uint64_t> > positions = replica->missing(0, index);
if (!positions.await(timeout.remaining())) {
- elected = false;
+ positions.discard();
return None();
} else if (positions.isFailed()) {
- elected = false;
return Error(positions.failure());
}
CHECK(positions.isReady()) << "Not expecting a discarded future!";
- foreach (uint64_t position, positions.get()) {
- Result<Action> result = fill(position, timeout);
- if (result.isError()) {
- elected = false;
- return Error(result.error());
- } else if (result.isNone()) {
- elected = false;
- return None();
- } else {
- CHECK_SOME(result);
- CHECK(result.get().position() == position);
- }
+ Future<Nothing> catching =
+ log::catchup(quorum, replica, network, proposal, positions.get());
+
+ if (!catching.await(timeout.remaining())) {
+ catching.discard();
+ return None();
+ } else if (catching.isFailed()) {
+ return Error(catching.failure());
}
- index += 1;
- return index - 1;
- }
+ CHECK(catching.isReady()) << "Not expecting a discarded future!";
- // Timed out ...
- LOG(INFO) << "Coordinator timed out while trying to get elected";
- return None();
+ elected = true;
+ return index++;
+ }
}
@@ -175,8 +152,8 @@ Result<uint64_t> Coordinator::append(
Action action;
action.set_position(index);
- action.set_promised(id);
- action.set_performed(id);
+ action.set_promised(proposal);
+ action.set_performed(proposal);
action.set_type(Action::APPEND);
Action::Append* append = action.mutable_append();
append->set_bytes(bytes);
@@ -184,7 +161,7 @@ Result<uint64_t> Coordinator::append(
Result<uint64_t> result = write(action, timeout);
if (result.isSome()) {
- CHECK(result.get() == index);
+ CHECK_EQ(result.get(), index);
index++;
}
@@ -202,8 +179,8 @@ Result<uint64_t> Coordinator::truncate(
Action action;
action.set_position(index);
- action.set_promised(id);
- action.set_performed(id);
+ action.set_promised(proposal);
+ action.set_performed(proposal);
action.set_type(Action::TRUNCATE);
Action::Truncate* truncate = action.mutable_truncate();
truncate->set_to(to);
@@ -211,7 +188,7 @@ Result<uint64_t> Coordinator::truncate(
Result<uint64_t> result = write(action, timeout);
if (result.isSome()) {
- CHECK(result.get() == index);
+ CHECK_EQ(result.get(), index);
index++;
}
@@ -233,309 +210,60 @@ Result<uint64_t> Coordinator::write(
CHECK(action.has_performed());
CHECK(action.has_type());
- // TODO(benh): Eliminate this special case hack?
- if (quorum == 1) {
- Result<uint64_t> result = commit(action);
- if (result.isError()) {
- return Error(result.error());
- } else if (result.isNone()) {
- return None();
- } else {
- CHECK_SOME(result);
- return action.position();
- }
- }
-
- WriteRequest request;
- request.set_id(id);
- request.set_position(action.position());
- request.set_type(action.type());
- switch (action.type()) {
- case Action::NOP:
- CHECK(action.has_nop());
- request.mutable_nop();
- break;
- case Action::APPEND:
- CHECK(action.has_append());
- request.mutable_append()->MergeFrom(action.append());
- break;
- case Action::TRUNCATE:
- CHECK(action.has_truncate());
- request.mutable_truncate()->MergeFrom(action.truncate());
- break;
- default:
- LOG(FATAL) << "Unknown Action::Type!";
- }
-
- // Broadcast the request to the network *excluding* the local replica.
- set<Future<WriteResponse> > futures =
- remotecast(protocol::write, request);
-
- uint32_t okays = 0;
-
- do {
- Future<Future<WriteResponse> > future = select(futures);
- if (future.await(timeout.remaining())) {
- CHECK(future.get().isReady());
- const WriteResponse& response = future.get().get();
- CHECK(response.id() == request.id());
- CHECK(response.position() == request.position());
- if (!response.okay()) {
- elected = false;
- return Error("Coordinator demoted");
- } else if (response.okay()) {
- if (++okays >= (quorum - 1)) { // N.B. Using (quorum - 1) here!
- // Got enough remote okays, discard the remaining futures
- // and try and commit the action locally.
- discard(futures);
- Result<uint64_t> result = commit(action);
- if (result.isError()) {
- return Error(result.error());
- } else if (result.isNone()) {
- return None();
- } else {
- CHECK_SOME(result);
- return action.position();
- }
- }
- }
- futures.erase(future.get());
- }
- } while (timeout.remaining() > Seconds(0));
-
- // Timed out ... discard remaining futures.
- LOG(INFO) << "Coordinator timed out while attempting to write "
- << Action::Type_Name(action.type())
- << " action at position " << action.position();
- discard(futures);
- return None();
-}
-
-
-Result<uint64_t> Coordinator::commit(const Action& action)
-{
- LOG(INFO) << "Coordinator attempting to commit "
- << Action::Type_Name(action.type())
- << " action at position " << action.position();
-
- CHECK(elected);
-
- WriteRequest request;
- request.set_id(id);
- request.set_position(action.position());
- request.set_learned(true); // A commit is just a learned write.
- request.set_type(action.type());
- switch (action.type()) {
- case Action::NOP:
- CHECK(action.has_nop());
- request.mutable_nop();
- break;
- case Action::APPEND:
- CHECK(action.has_append());
- request.mutable_append()->MergeFrom(action.append());
- break;
- case Action::TRUNCATE:
- CHECK(action.has_truncate());
- request.mutable_truncate()->MergeFrom(action.truncate());
- break;
- default:
- LOG(FATAL) << "Unknown Action::Type!";
- }
-
- // TODO(benh): Add a non-message based way to do this write.
- Future<WriteResponse> future = protocol::write(replica->pid(), request);
-
- // We send a write request to the *local* replica just as the
- // others: asynchronously via messages. However, rather than add the
- // complications of dealing with timeouts for local operations
- // (especially since we are trying to commit something), we make
- // things simpler and block on the response from the local replica.
- // Maybe we can let it timeout, but consider it a failure? This
- // might be sound because we don't send the learned messages ... so
- // this should be the same as if we just failed before we even do
- // the write ... a client should just retry this write later.
+ Future<WriteResponse> writing =
+ log::write(quorum, network, proposal, action);
- future.await(); // TODO(benh): Don't wait forever, see comment above.
-
- if (future.isFailed()) {
- return Error(future.failure());
+ if (!writing.await(timeout.remaining())) {
+ writing.discard();
+ return None();
+ } else if (writing.isFailed()) {
+ return Error(writing.failure());
}
- CHECK(future.isReady()) << "Not expecting a discarded future!";
-
- const WriteResponse& response = future.get();
- CHECK(response.id() == request.id());
- CHECK(response.position() == request.position());
+ CHECK(writing.isReady()) << "Not expecting a discarded future!";
+ const WriteResponse& response = writing.get();
if (!response.okay()) {
elected = false;
+ proposal = response.proposal();
return Error("Coordinator demoted");
- }
-
- // Commit successful, send a learned message to the network
- // *excluding* the local replica and return the position.
-
- LearnedMessage message;
- message.mutable_action()->MergeFrom(action);
-
- if (!action.has_learned() || !action.learned()) {
- message.mutable_action()->set_learned(true);
- }
-
- LOG(INFO) << "Telling other replicas of learned action at position "
- << action.position();
-
- remotecast(message);
-
- return action.position();
-}
-
-
-Result<Action> Coordinator::fill(uint64_t position, const Timeout& timeout)
-{
- LOG(INFO) << "Coordinator attempting to fill position "
- << position << " in the log";
-
- CHECK(elected);
-
- PromiseRequest request;
- request.set_id(id);
- request.set_position(position);
-
- // Broadcast the request to the network.
- set<Future<PromiseResponse> > futures =
- broadcast(protocol::promise, request);
-
- list<PromiseResponse> responses;
-
- do {
- Future<Future<PromiseResponse> > future = select(futures);
- if (future.await(timeout.remaining())) {
- CHECK(future.get().isReady());
- const PromiseResponse& response = future.get().get();
- CHECK(response.id() == request.id());
- if (!response.okay()) {
- elected = false;
- return Error("Coordinator demoted");
- } else if (response.okay()) {
- responses.push_back(response);
- if (responses.size() >= quorum) {
- break;
- }
- }
- futures.erase(future.get());
- }
- } while (timeout.remaining() > Seconds(0));
-
- // Discard the remaining futures.
- discard(futures);
-
- // Either have a quorum or we timed out.
- if (responses.size() >= quorum) {
- // Check the responses for a learned action, otherwise, pick the
- // action with the higest performed id or a no-op if no responses
- // include performed actions.
- Action action;
- foreach (const PromiseResponse& response, responses) {
- if (response.has_action()) {
- CHECK(response.action().position() == position);
- if (response.action().has_learned() && response.action().learned()) {
- // Received a learned action, try and commit locally. Note
- // that there is no checking that we get the _same_ learned
- // action in the event we get multiple responses with
- // learned actions, we just take the "first". In fact, there
- // is a specific instance in which learned actions will NOT
- // be the same! In this instance, one replica may return
- // that the action is a learned no-op because it knows the
- // position has been truncated while another replica (that
- // hasn't learned the truncation yet) might return the
- // actual action at this position. Picking either action is
- // _correct_, since eventually we know this position will be
- // truncated. Fun!
- Result<uint64_t> result = commit(response.action());
- if (result.isError()) {
- return Error(result.error());
- } else if (result.isNone()) {
- return None();
- } else {
- CHECK_SOME(result);
- return response.action();
- }
- } else if (response.action().has_performed() &&
- (!action.has_performed() ||
- response.action().performed() > action.performed())) {
- action = response.action();
- }
- } else {
- CHECK(response.has_position());
- CHECK(response.position() == position);
- }
+ } else {
+ // TODO(jieyu): Currently, each log operation (append or truncate)
+ // will write the same log content to the local disk twice: one
+ // from log::write() and one from log::learn(). In the future, we
+ // may want to use checksum to eliminate the duplicate disk write.
+ Future<Nothing> learning = log::learn(network, action);
+
+ // We need to make sure that the learned message has been broadcast,
+ // and thus enqueued. Otherwise, our "missing" check below will
+ // sometimes fail due to a race condition.
+ if (!learning.await(timeout.remaining())) {
+ learning.discard();
+ return None();
+ } else if (learning.isFailed()) {
+ return Error(learning.failure());
}
- // Use a no-op if no known action has been performed.
- if (!action.has_performed()) {
- action.set_position(position);
- action.set_promised(id);
- action.set_performed(id);
- action.set_type(Action::NOP);
- action.mutable_nop();
- } else {
- action.set_performed(id);
- }
+ CHECK(learning.isReady()) << "Not expecting a discarded future!";
- Result<uint64_t> result = write(action, timeout);
+ // Make sure that the local replica has learned the newly written
+ // log entry. Since messages are delivered and dispatched in order
+ // locally, we should always have the new entry learned by now.
+ Future<bool> checking = replica->missing(action.position());
- if (result.isError()) {
- return Error(result.error());
- } else if (result.isNone()) {
+ if (!checking.await(timeout.remaining())) {
+ checking.discard();
return None();
- } else {
- CHECK_SOME(result);
- return action;
+ } else if (checking.isFailed()) {
+ return Error(checking.failure());
}
- }
-
- // Timed out ...
- LOG(INFO) << "Coordinator timed out attempting to fill position "
- << position << " in the log";
- return None();
-}
+ CHECK(checking.isReady()) << "Not expecting a discarded future!";
-template <typename Req, typename Res>
-set<Future<Res> > Coordinator::broadcast(
- const Protocol<Req, Res>& protocol,
- const Req& req)
-{
- Future<set<Future<Res> > > futures =
- network->broadcast(protocol, req);
- futures.await();
- CHECK(futures.isReady());
- return futures.get();
-}
-
+ CHECK(!checking.get());
-template <typename Req, typename Res>
-set<Future<Res> > Coordinator::remotecast(
- const Protocol<Req, Res>& protocol,
- const Req& req)
-{
- set<UPID> filter;
- filter.insert(replica->pid());
- Future<set<Future<Res> > > futures =
- network->broadcast(protocol, req, filter);
- futures.await();
- CHECK(futures.isReady());
- return futures.get();
-}
-
-
-template <typename M>
-void Coordinator::remotecast(const M& m)
-{
- set<UPID> filter;
- filter.insert(replica->pid());
- network->broadcast(m, filter);
+ return action.position();
+ }
}
} // namespace log {
http://git-wip-us.apache.org/repos/asf/mesos/blob/19ad88b7/src/log/coordinator.hpp
----------------------------------------------------------------------
diff --git a/src/log/coordinator.hpp b/src/log/coordinator.hpp
index 3f6fb7c..b0ff8df 100644
--- a/src/log/coordinator.hpp
+++ b/src/log/coordinator.hpp
@@ -19,10 +19,11 @@
#ifndef __LOG_COORDINATOR_HPP__
#define __LOG_COORDINATOR_HPP__
+#include <stdint.h>
+
#include <string>
-#include <vector>
-#include <process/process.hpp>
+#include <process/shared.hpp>
#include <process/timeout.hpp>
#include <stout/result.hpp>
@@ -30,9 +31,6 @@
#include "log/network.hpp"
#include "log/replica.hpp"
-#include "messages/log.hpp"
-
-
namespace mesos {
namespace internal {
namespace log {
@@ -40,7 +38,10 @@ namespace log {
class Coordinator
{
public:
- Coordinator(int quorum, Replica* replica, Network* group);
+ Coordinator(
+ size_t _quorum,
+ const process::Shared<Replica>& _replica,
+ const process::Shared<Network>& _network);
~Coordinator();
@@ -65,45 +66,16 @@ public:
Result<uint64_t> truncate(uint64_t to, const process::Timeout& timeout);
private:
- // Helper that tries to achieve consensus of the specified action. A
- // result of none means the write failed (e.g., due to timeout), but
- // can be retried.
- Result<uint64_t> write(const Action& action, const process::Timeout& timeout);
-
- // Helper that handles commiting an action (i.e., writing to the
- // local replica and then sending out learned messages).
- Result<uint64_t> commit(const Action& action);
-
- // Helper that tries to fill a position in the log.
- Result<Action> fill(uint64_t position, const process::Timeout& timeout);
-
- // Helper that uses the specified protocol to broadcast a request to
- // our group and return a set of futures.
- template <typename Req, typename Res>
- std::set<process::Future<Res> > broadcast(
- const Protocol<Req, Res>& protocol,
- const Req& req);
-
- // Helper like broadcast, but excludes our local replica.
- template <typename Req, typename Res>
- std::set<process::Future<Res> > remotecast(
- const Protocol<Req, Res>& protocol,
- const Req& req);
-
- // Helper like remotecast but ignores any responses.
- template <typename M>
- void remotecast(const M& m);
-
- bool elected; // True if this coordinator has been elected.
-
- const uint32_t quorum; // Quorum size.
-
- Replica* replica; // Local log replica.
-
- Network* network; // Used to broadcast requests and messages to replicas.
+ Result<uint64_t> write(
+ const Action& action,
+ const process::Timeout& timeout);
- uint64_t id; // Coordinator ID.
+ const size_t quorum;
+ const process::Shared<Replica> replica;
+ const process::Shared<Network> network;
+ bool elected; // True if this coordinator has been elected.
+ uint64_t proposal; // Currently used proposal number.
uint64_t index; // Last position written in the log.
};
http://git-wip-us.apache.org/repos/asf/mesos/blob/19ad88b7/src/log/log.hpp
----------------------------------------------------------------------
diff --git a/src/log/log.hpp b/src/log/log.hpp
index 77edc7a..042f13b 100644
--- a/src/log/log.hpp
+++ b/src/log/log.hpp
@@ -23,7 +23,9 @@
#include <set>
#include <string>
+#include <process/owned.hpp>
#include <process/process.hpp>
+#include <process/shared.hpp>
#include <process/timeout.hpp>
#include <stout/check.hpp>
@@ -139,7 +141,7 @@ public:
Position ending();
private:
- Replica* replica;
+ process::Shared<Replica> replica;
};
class Writer
@@ -179,18 +181,18 @@ public:
Log(int _quorum,
const std::string& path,
const std::set<process::UPID>& pids)
- : group(NULL)
+ : group(NULL),
+ executor(NULL),
+ quorum(_quorum),
+ replica(new Replica(path))
{
GOOGLE_PROTOBUF_VERIFY_VERSION;
- quorum = _quorum;
+ // Add our own replica to the network.
+ Network* _network = new Network(pids);
+ _network->add(replica->pid());
- replica = new Replica(path);
-
- network = new Network(pids);
-
- // Don't forget to add our own replica!
- network->add(replica->pid());
+ network.reset(_network);
}
// Creates a new replicated log that assumes the specified quorum
@@ -203,36 +205,34 @@ public:
const Duration& timeout,
const std::string& znode,
const Option<zookeeper::Authentication>& auth = None())
+ : group(new zookeeper::Group(servers, timeout, znode, auth)),
+ executor(new process::Executor()),
+ quorum(_quorum),
+ replica(new Replica(path)),
+ network(new ZooKeeperNetwork(servers, timeout, znode, auth))
{
GOOGLE_PROTOBUF_VERIFY_VERSION;
- quorum = _quorum;
-
- LOG(INFO) << "Creating a new log replica";
-
- replica = new Replica(path);
-
- group = new zookeeper::Group(servers, timeout, znode, auth);
- network = new ZooKeeperNetwork(group);
-
// Need to add our replica to the ZooKeeper group!
LOG(INFO) << "Attempting to join replica to ZooKeeper group";
membership = group->join(replica->pid())
- .onFailed(executor.defer(lambda::bind(&Log::failed, this, lambda::_1)))
- .onDiscarded(executor.defer(lambda::bind(&Log::discarded, this)));
+ .onFailed(executor->defer(lambda::bind(&Log::failed, this, lambda::_1)))
+ .onDiscarded(executor->defer(lambda::bind(&Log::discarded, this)));
group->watch()
- .onReady(executor.defer(lambda::bind(&Log::watch, this, lambda::_1)))
- .onFailed(executor.defer(lambda::bind(&Log::failed, this, lambda::_1)))
- .onDiscarded(executor.defer(lambda::bind(&Log::discarded, this)));
+ .onReady(executor->defer(lambda::bind(&Log::watch, this, lambda::_1)))
+ .onFailed(executor->defer(lambda::bind(&Log::failed, this, lambda::_1)))
+ .onDiscarded(executor->defer(lambda::bind(&Log::discarded, this)));
}
~Log()
{
- delete network;
+ network.own().await();
+ replica.own().await();
+
+ delete executor;
delete group;
- delete replica;
}
// Returns a position based off of the bytes recovered from
@@ -261,14 +261,15 @@ private:
void failed(const std::string& message) const;
void discarded() const;
+ // We store a Group instance in order to continually renew the
+ // replica's membership (when using ZooKeeper).
zookeeper::Group* group;
process::Future<zookeeper::Group::Membership> membership;
- process::Executor executor;
+ process::Executor* executor;
int quorum;
-
- Replica* replica;
- Network* network;
+ process::Shared<Replica> replica;
+ process::Shared<Network> network;
};
@@ -420,14 +421,14 @@ void Log::watch(const std::set<zookeeper::Group::Membership>& memberships)
// Our replica's membership must have expired, join back up.
LOG(INFO) << "Renewing replica group membership";
membership = group->join(replica->pid())
- .onFailed(executor.defer(lambda::bind(&Log::failed, this, lambda::_1)))
- .onDiscarded(executor.defer(lambda::bind(&Log::discarded, this)));
+ .onFailed(executor->defer(lambda::bind(&Log::failed, this, lambda::_1)))
+ .onDiscarded(executor->defer(lambda::bind(&Log::discarded, this)));
}
group->watch(memberships)
- .onReady(executor.defer(lambda::bind(&Log::watch, this, lambda::_1)))
- .onFailed(executor.defer(lambda::bind(&Log::failed, this, lambda::_1)))
- .onDiscarded(executor.defer(lambda::bind(&Log::discarded, this)));
+ .onReady(executor->defer(lambda::bind(&Log::watch, this, lambda::_1)))
+ .onFailed(executor->defer(lambda::bind(&Log::failed, this, lambda::_1)))
+ .onDiscarded(executor->defer(lambda::bind(&Log::discarded, this)));
}
http://git-wip-us.apache.org/repos/asf/mesos/blob/19ad88b7/src/log/network.hpp
----------------------------------------------------------------------
diff --git a/src/log/network.hpp b/src/log/network.hpp
index d34cf78..2b674f6 100644
--- a/src/log/network.hpp
+++ b/src/log/network.hpp
@@ -33,6 +33,7 @@
#include <stout/duration.hpp>
#include <stout/foreach.hpp>
#include <stout/lambda.hpp>
+#include <stout/nothing.hpp>
#include "logging/logging.hpp"
@@ -67,13 +68,14 @@ public:
process::Future<std::set<process::Future<Res> > > broadcast(
const Protocol<Req, Res>& protocol,
const Req& req,
- const std::set<process::UPID>& filter = std::set<process::UPID>());
+ const std::set<process::UPID>& filter = std::set<process::UPID>()) const;
- // Sends a message to each member of the network.
+ // Sends a message to each member of the network. The returned
+ // future is set once the message has been broadcast.
template <typename M>
- void broadcast(
+ process::Future<Nothing> broadcast(
const M& m,
- const std::set<process::UPID>& filter = std::set<process::UPID>());
+ const std::set<process::UPID>& filter = std::set<process::UPID>()) const;
private:
// Not copyable, not assignable.
@@ -87,11 +89,19 @@ private:
class ZooKeeperNetwork : public Network
{
public:
- ZooKeeperNetwork(zookeeper::Group* group);
+ ZooKeeperNetwork(
+ const std::string& servers,
+ const Duration& timeout,
+ const std::string& znode,
+ const Option<zookeeper::Authentication>& auth);
private:
typedef ZooKeeperNetwork This;
+ // Not copyable, not assignable.
+ ZooKeeperNetwork(const ZooKeeperNetwork&);
+ ZooKeeperNetwork& operator = (const ZooKeeperNetwork&);
+
// Helper that sets up a watch on the group.
void watch(const std::set<zookeeper::Group::Membership>& expected);
@@ -101,9 +111,13 @@ private:
// Invoked when group members data has been collected.
void collected(const process::Future<std::list<std::string> >& datas);
- zookeeper::Group* group;
- process::Executor executor;
+ zookeeper::Group group;
process::Future<std::set<zookeeper::Group::Membership> > memberships;
+
+ // NOTE: The declaration order here is important. We want to delete
+ // the 'executor' before we delete the 'group' so that we don't get
+ // spurious fatal errors when the 'group' is being deleted.
+ process::Executor executor;
};
@@ -157,7 +171,7 @@ public:
}
template <typename M>
- void broadcast(
+ Nothing broadcast(
const M& m,
const std::set<process::UPID>& filter)
{
@@ -168,6 +182,7 @@ public:
process::post(pid, m);
}
}
+ return Nothing();
}
private:
@@ -223,7 +238,7 @@ template <typename Req, typename Res>
process::Future<std::set<process::Future<Res> > > Network::broadcast(
const Protocol<Req, Res>& protocol,
const Req& req,
- const std::set<process::UPID>& filter)
+ const std::set<process::UPID>& filter) const
{
return process::dispatch(process, &NetworkProcess::broadcast<Req, Res>,
protocol, req, filter);
@@ -231,20 +246,24 @@ process::Future<std::set<process::Future<Res> > > Network::broadcast(
template <typename M>
-void Network::broadcast(
+process::Future<Nothing> Network::broadcast(
const M& m,
- const std::set<process::UPID>& filter)
+ const std::set<process::UPID>& filter) const
{
// Need to disambiguate overloaded function.
- void (NetworkProcess::*broadcast)(const M&, const std::set<process::UPID>&) =
- &NetworkProcess::broadcast<M>;
+ Nothing (NetworkProcess::*broadcast)(const M&, const std::set<process::UPID>&)
+ = &NetworkProcess::broadcast<M>;
- process::dispatch(process, broadcast, m, filter);
+ return process::dispatch(process, broadcast, m, filter);
}
-inline ZooKeeperNetwork::ZooKeeperNetwork(zookeeper::Group* _group)
- : group(_group)
+inline ZooKeeperNetwork::ZooKeeperNetwork(
+ const std::string& servers,
+ const Duration& timeout,
+ const std::string& znode,
+ const Option<zookeeper::Authentication>& auth)
+ : group(servers, timeout, znode, auth)
{
watch(std::set<zookeeper::Group::Membership>());
}
@@ -253,7 +272,7 @@ inline ZooKeeperNetwork::ZooKeeperNetwork(zookeeper::Group* _group)
inline void ZooKeeperNetwork::watch(
const std::set<zookeeper::Group::Membership>& expected)
{
- memberships = group->watch(expected);
+ memberships = group.watch(expected);
memberships
.onAny(executor.defer(lambda::bind(&This::watched, this, lambda::_1)));
}
@@ -264,7 +283,7 @@ inline void ZooKeeperNetwork::watched(
{
if (memberships.isFailed()) {
// We can't do much here, we could try creating another Group but
- // that might just continue indifinitely, so we fail early
+ // that might just continue indefinitely, so we fail early
// instead. Note that Group handles all retryable/recoverable
// ZooKeeper errors internally.
LOG(FATAL) << "Failed to watch ZooKeeper group: " << memberships.failure();
@@ -278,7 +297,7 @@ inline void ZooKeeperNetwork::watched(
std::list<process::Future<std::string> > futures;
foreach (const zookeeper::Group::Membership& membership, memberships.get()) {
- futures.push_back(group->data(membership));
+ futures.push_back(group.data(membership));
}
process::collect(futures, process::Timeout::in(Seconds(5)))
[10/10] git commit: Changed log recover implementation to only
broadcast requests when there are enough replicas in the network.
Posted by be...@apache.org.
Changed log recover implementation to only broadcast requests when
there are enough replicas in the network.
From: Jie Yu <yu...@gmail.com>
Review: https://reviews.apache.org/r/16982
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/420e30bf
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/420e30bf
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/420e30bf
Branch: refs/heads/master
Commit: 420e30bfe5ce4c8b14bdccedff21f66475d91f18
Parents: 3e33188
Author: Benjamin Hindman <be...@gmail.com>
Authored: Thu Jan 16 16:56:46 2014 -0800
Committer: Benjamin Hindman <be...@gmail.com>
Committed: Thu Jan 16 16:56:46 2014 -0800
----------------------------------------------------------------------
src/log/recover.cpp | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/420e30bf/src/log/recover.cpp
----------------------------------------------------------------------
diff --git a/src/log/recover.cpp b/src/log/recover.cpp
index 0ab8e95..0f827d7 100644
--- a/src/log/recover.cpp
+++ b/src/log/recover.cpp
@@ -148,6 +148,26 @@ private:
{
CHECK_NE(status, Metadata::VOTING);
+ // Wait until there are enough (i.e., quorum of) replicas in the
+ // network to avoid unnecessary retries.
+ network->watch(quorum, Network::GREATER_THAN_OR_EQUAL_TO)
+ .onAny(defer(self(), &Self::watched, lambda::_1));
+ }
+
+ void watched(const Future<size_t>& future)
+ {
+ if (!future.isReady()) {
+ promise.fail(
+ future.isFailed() ?
+ future.failure() :
+ "Not expecting discarded future");
+
+ terminate(self());
+ return;
+ }
+
+ CHECK_GE(future.get(), quorum);
+
// Broadcast recover request to all replicas.
network->broadcast(protocol::recover, RecoverRequest())
.onAny(defer(self(), &Self::broadcasted, lambda::_1));
[08/10] git commit: Added a watch function to watch for network size
changes.
Posted by be...@apache.org.
Added a watch function to watch for network size changes.
From: Jie Yu <yu...@gmail.com>
Review: https://reviews.apache.org/r/16064
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/c1e3b741
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/c1e3b741
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/c1e3b741
Branch: refs/heads/master
Commit: c1e3b741d2936340b6ea7170a1737d1e5d838d07
Parents: fa5d450
Author: Benjamin Hindman <be...@gmail.com>
Authored: Thu Jan 16 16:55:56 2014 -0800
Committer: Benjamin Hindman <be...@gmail.com>
Committed: Thu Jan 16 16:55:56 2014 -0800
----------------------------------------------------------------------
src/log/network.hpp | 110 +++++++++++++++++++++++++++++++++++++++++++
src/tests/log_tests.cpp | 49 +++++++++++++++++++
2 files changed, 159 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/c1e3b741/src/log/network.hpp
----------------------------------------------------------------------
diff --git a/src/log/network.hpp b/src/log/network.hpp
index 2b674f6..9c76bf8 100644
--- a/src/log/network.hpp
+++ b/src/log/network.hpp
@@ -22,6 +22,7 @@
// TODO(benh): Eventually move and associate this code with the
// libprocess protobuf code rather than keep it here.
+#include <list>
#include <set>
#include <string>
@@ -49,6 +50,16 @@ class NetworkProcess;
class Network
{
public:
+ enum WatchMode
+ {
+ EQUAL_TO,
+ NOT_EQUAL_TO,
+ LESS_THAN,
+ LESS_THAN_OR_EQUAL_TO,
+ GREATER_THAN,
+ GREATER_THAN_OR_EQUAL_TO
+ };
+
Network();
Network(const std::set<process::UPID>& pids);
virtual ~Network();
@@ -62,6 +73,14 @@ public:
// Set the PIDs that are part of this network.
void set(const std::set<process::UPID>& pids);
+ // Returns a future which gets set when the network size satisfies
+ // the constraint specified by 'size' and 'mode'. For example, if
+ // 'size' is 2 and 'mode' is GREATER_THAN, then the returned future
+ // will get set when the size of the network is greater than 2.
+ process::Future<size_t> watch(
+ size_t size,
+ WatchMode mode = NOT_EQUAL_TO) const;
+
// Sends a request to each member of the network and returns a set
// of futures that represent their responses.
template <typename Req, typename Res>
@@ -135,12 +154,18 @@ public:
{
link(pid); // Try and keep a socket open (more efficient).
pids.insert(pid);
+
+ // Update any pending watches.
+ update();
}
void remove(const process::UPID& pid)
{
// TODO(benh): unlink(pid);
pids.erase(pid);
+
+ // Update any pending watches.
+ update();
}
void set(const std::set<process::UPID>& _pids)
@@ -149,6 +174,23 @@ public:
foreach (const process::UPID& pid, _pids) {
add(pid); // Also does a link.
}
+
+ // Update any pending watches.
+ update();
+ }
+
+ process::Future<size_t> watch(size_t size, Network::WatchMode mode)
+ {
+ if (satisfied(size, mode)) {
+ return pids.size();
+ }
+
+ Watch* watch = new Watch(size, mode);
+ watches.push_back(watch);
+
+ // TODO(jieyu): Consider deleting 'watch' if the returned future
+ // is discarded by the user.
+ return watch->promise.future();
}
// Sends a request to each of the groups members and returns a set
@@ -185,12 +227,73 @@ public:
return Nothing();
}
+protected:
+ virtual void finalize()
+ {
+ foreach (Watch* watch, watches) {
+ watch->promise.fail("Network is being terminated");
+ delete watch;
+ }
+ watches.clear();
+ }
+
private:
+ struct Watch
+ {
+ Watch(size_t _size, Network::WatchMode _mode)
+ : size(_size), mode(_mode) {}
+
+ size_t size;
+ Network::WatchMode mode;
+ process::Promise<size_t> promise;
+ };
+
// Not copyable, not assignable.
NetworkProcess(const NetworkProcess&);
NetworkProcess& operator = (const NetworkProcess&);
+ // Notifies pending watches of a change in the network.
+ void update()
+ {
+ const size_t size = watches.size();
+ for (size_t i = 0; i < size; i++) {
+ Watch* watch = watches.front();
+ watches.pop_front();
+
+ if (satisfied(watch->size, watch->mode)) {
+ watch->promise.set(pids.size());
+ delete watch;
+ } else {
+ watches.push_back(watch);
+ }
+ }
+ }
+
+ // Returns true if the current size of the network satisfies the
+ // constraint specified by 'size' and 'mode'.
+ bool satisfied(size_t size, Network::WatchMode mode)
+ {
+ switch (mode) {
+ case Network::EQUAL_TO:
+ return pids.size() == size;
+ case Network::NOT_EQUAL_TO:
+ return pids.size() != size;
+ case Network::LESS_THAN:
+ return pids.size() < size;
+ case Network::LESS_THAN_OR_EQUAL_TO:
+ return pids.size() <= size;
+ case Network::GREATER_THAN:
+ return pids.size() > size;
+ case Network::GREATER_THAN_OR_EQUAL_TO:
+ return pids.size() >= size;
+ default:
+ LOG(FATAL) << "Invalid watch mode";
+ break;
+ }
+ }
+
std::set<process::UPID> pids;
+ std::list<Watch*> watches;
};
@@ -234,6 +337,13 @@ inline void Network::set(const std::set<process::UPID>& pids)
}
+// Forwards the watch request to the underlying NetworkProcess via
+// dispatch and hands back the resulting future.
+inline process::Future<size_t> Network::watch(
+    size_t size, Network::WatchMode mode) const
+{
+  process::Future<size_t> watching =
+    process::dispatch(process, &NetworkProcess::watch, size, mode);
+
+  return watching;
+}
+
+
template <typename Req, typename Res>
process::Future<std::set<process::Future<Res> > > Network::broadcast(
const Protocol<Req, Res>& protocol,
http://git-wip-us.apache.org/repos/asf/mesos/blob/c1e3b741/src/tests/log_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/log_tests.cpp b/src/tests/log_tests.cpp
index f866dde..033e8e5 100644
--- a/src/tests/log_tests.cpp
+++ b/src/tests/log_tests.cpp
@@ -28,6 +28,7 @@
#include <process/gtest.hpp>
#include <process/owned.hpp>
#include <process/pid.hpp>
+#include <process/process.hpp>
#include <process/protobuf.hpp>
#include <process/shared.hpp>
@@ -68,6 +69,54 @@ using testing::Eq;
using testing::Return;
+// Exercises Network::watch with every watch mode while members are
+// added to and removed from the network.
+TEST(NetworkTest, Watch)
+{
+  UPID member1 = ProcessBase().self();
+  UPID member2 = ProcessBase().self();
+
+  Network network;
+
+  // The network is empty at this point, so each of the following
+  // watches is expected to be ready at once and report a size of 0.
+
+  // Test the default parameter.
+  Future<size_t> watching = network.watch(1u);
+  AWAIT_READY(watching);
+  EXPECT_EQ(0u, watching.get());
+
+  watching = network.watch(2u, Network::NOT_EQUAL_TO);
+  AWAIT_READY(watching);
+  EXPECT_EQ(0u, watching.get());
+
+  watching = network.watch(0u, Network::GREATER_THAN_OR_EQUAL_TO);
+  AWAIT_READY(watching);
+  EXPECT_EQ(0u, watching.get());
+
+  watching = network.watch(1u, Network::LESS_THAN);
+  AWAIT_READY(watching);
+  EXPECT_EQ(0u, watching.get());
+
+  // One member: an EQUAL_TO watch on 1 is satisfied right away.
+  network.add(member1);
+
+  watching = network.watch(1u, Network::EQUAL_TO);
+  AWAIT_READY(watching);
+  EXPECT_EQ(1u, watching.get());
+
+  // A watch that is not yet satisfied stays pending until the
+  // network grows to two members.
+  watching = network.watch(1u, Network::GREATER_THAN);
+  ASSERT_TRUE(watching.isPending());
+
+  network.add(member2);
+
+  AWAIT_READY(watching);
+  EXPECT_EQ(2u, watching.get());
+
+  // Likewise, a pending watch is completed when the network shrinks.
+  watching = network.watch(1u, Network::LESS_THAN_OR_EQUAL_TO);
+  ASSERT_TRUE(watching.isPending());
+
+  network.remove(member2);
+
+  AWAIT_READY(watching);
+  EXPECT_EQ(1u, watching.get());
+}
+
+
class ReplicaTest : public TemporaryDirectoryTest
{
protected:
[09/10] git commit: Only broadcast requests when there are enough
replicas in the network.
Posted by be...@apache.org.
Only broadcast requests when there are enough replicas in the network.
From: Jie Yu <yu...@gmail.com>
Review: https://reviews.apache.org/r/16065
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/3e331889
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/3e331889
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/3e331889
Branch: refs/heads/master
Commit: 3e3318892c666195f1b4ebd6357f416717454fcf
Parents: c1e3b74
Author: Benjamin Hindman <be...@gmail.com>
Authored: Thu Jan 16 16:56:20 2014 -0800
Committer: Benjamin Hindman <be...@gmail.com>
Committed: Thu Jan 16 16:56:20 2014 -0800
----------------------------------------------------------------------
src/log/consensus.cpp | 99 +++++++++++++++++++++++++++++++++++++---------
1 file changed, 81 insertions(+), 18 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/3e331889/src/log/consensus.cpp
----------------------------------------------------------------------
diff --git a/src/log/consensus.cpp b/src/log/consensus.cpp
index 5eb90e7..b89673a 100644
--- a/src/log/consensus.cpp
+++ b/src/log/consensus.cpp
@@ -68,11 +68,11 @@ protected:
promise.future().onDiscarded(lambda::bind(
static_cast<void(*)(const UPID&, bool)>(terminate), self(), true));
- request.set_proposal(proposal);
- request.set_position(position);
-
- network->broadcast(protocol::promise, request)
- .onAny(defer(self(), &Self::broadcasted, lambda::_1));
+ // Wait until there are enough (i.e., a quorum of) replicas in the
+ // network. With fewer than a quorum of replicas in the network,
+ // the operation would never finish.
+ network->watch(quorum, Network::GREATER_THAN_OR_EQUAL_TO)
+ .onAny(defer(self(), &Self::watched, lambda::_1));
}
virtual void finalize()
@@ -84,6 +84,27 @@ protected:
}
private:
+  // Invoked with the outcome of the network size watch. If the watch
+  // is ready, the promise request (proposal and position) is
+  // broadcast to the network; otherwise the promise is failed and
+  // this process is terminated.
+  void watched(const Future<size_t>& future)
+  {
+    if (future.isReady()) {
+      CHECK_GE(future.get(), quorum);
+
+      request.set_proposal(proposal);
+      request.set_position(position);
+
+      network->broadcast(protocol::promise, request)
+        .onAny(defer(self(), &Self::broadcasted, lambda::_1));
+
+      return;
+    }
+
+    if (future.isFailed()) {
+      promise.fail(future.failure());
+    } else {
+      promise.fail("Not expecting discarded future");
+    }
+
+    terminate(self());
+  }
+
void broadcasted(const Future<set<Future<PromiseResponse> > >& future)
{
if (!future.isReady()) {
@@ -224,10 +245,11 @@ protected:
promise.future().onDiscarded(lambda::bind(
static_cast<void(*)(const UPID&, bool)>(terminate), self(), true));
- request.set_proposal(proposal);
-
- network->broadcast(protocol::promise, request)
- .onAny(defer(self(), &Self::broadcasted, lambda::_1));
+ // Wait until there are enough (i.e., a quorum of) replicas in the
+ // network. With fewer than a quorum of replicas in the network,
+ // the operation would never finish.
+ network->watch(quorum, Network::GREATER_THAN_OR_EQUAL_TO)
+ .onAny(defer(self(), &Self::watched, lambda::_1));
}
virtual void finalize()
@@ -239,6 +261,26 @@ protected:
}
private:
+  // Invoked with the outcome of the network size watch. If the watch
+  // is ready, the promise request (proposal only) is broadcast to the
+  // network; otherwise the promise is failed and this process is
+  // terminated.
+  void watched(const Future<size_t>& future)
+  {
+    if (future.isReady()) {
+      CHECK_GE(future.get(), quorum);
+
+      request.set_proposal(proposal);
+
+      network->broadcast(protocol::promise, request)
+        .onAny(defer(self(), &Self::broadcasted, lambda::_1));
+
+      return;
+    }
+
+    if (future.isFailed()) {
+      promise.fail(future.failure());
+    } else {
+      promise.fail("Not expecting discarded future");
+    }
+
+    terminate(self());
+  }
+
void broadcasted(const Future<set<Future<PromiseResponse> > >& future)
{
if (!future.isReady()) {
@@ -341,6 +383,36 @@ protected:
promise.future().onDiscarded(lambda::bind(
static_cast<void(*)(const UPID&, bool)>(terminate), self(), true));
+ // Wait until there are enough (i.e., a quorum of) replicas in the
+ // network. With fewer than a quorum of replicas in the network,
+ // the operation would never finish.
+ network->watch(quorum, Network::GREATER_THAN_OR_EQUAL_TO)
+ .onAny(defer(self(), &Self::watched, lambda::_1));
+ }
+
+  // This process is terminated once responses from a quorum of
+  // replicas have been received. Responses from the remaining
+  // replicas are no longer of interest at that point, so they are
+  // discarded here.
+  virtual void finalize()
+  {
+    discard(responses);
+  }
+
+private:
+ void watched(const Future<size_t>& future)
+ {
+ if (!future.isReady()) {
+ promise.fail(
+ future.isFailed() ?
+ future.failure() :
+ "Not expecting discarded future");
+
+ terminate(self());
+ return;
+ }
+
+ CHECK_GE(future.get(), quorum);
+
request.set_proposal(proposal);
request.set_position(action.position());
request.set_type(action.type());
@@ -365,15 +437,6 @@ protected:
.onAny(defer(self(), &Self::broadcasted, lambda::_1));
}
- virtual void finalize()
- {
- // This process will be terminated when we get responses from a
- // quorum of replicas. In that case, we no longer care about
- // responses from other replicas, thus discarding them here.
- discard(responses);
- }
-
-private:
void broadcasted(const Future<set<Future<WriteResponse> > >& future)
{
if (!future.isReady()) {
[03/10] git commit: Libprocessify the coordinator.
Posted by be...@apache.org.
Libprocessify the coordinator.
From: Jie Yu <yu...@gmail.com>
Review: https://reviews.apache.org/r/14902
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/6ea7c14e
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/6ea7c14e
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/6ea7c14e
Branch: refs/heads/master
Commit: 6ea7c14e633feaa372d5565eaf6c9b57aff12b73
Parents: 19ad88b
Author: Benjamin Hindman <be...@gmail.com>
Authored: Thu Jan 16 16:53:10 2014 -0800
Committer: Benjamin Hindman <be...@gmail.com>
Committed: Thu Jan 16 16:53:10 2014 -0800
----------------------------------------------------------------------
src/log/coordinator.cpp | 572 ++++++++++++++++++++++++++++++++-----------
src/log/coordinator.hpp | 16 +-
2 files changed, 439 insertions(+), 149 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/6ea7c14e/src/log/coordinator.cpp
----------------------------------------------------------------------
diff --git a/src/log/coordinator.cpp b/src/log/coordinator.cpp
index b2ead8e..21f2865 100644
--- a/src/log/coordinator.cpp
+++ b/src/log/coordinator.cpp
@@ -18,6 +18,12 @@
#include <algorithm>
+#include <process/defer.hpp>
+#include <process/dispatch.hpp>
+#include <process/future.hpp>
+#include <process/id.hpp>
+#include <process/process.hpp>
+
#include <stout/error.hpp>
#include <stout/none.hpp>
@@ -36,67 +42,178 @@ namespace mesos {
namespace internal {
namespace log {
-Coordinator::Coordinator(
- size_t _quorum,
- const Shared<Replica>& _replica,
- const Shared<Network>& _network)
- : quorum(_quorum),
- replica(_replica),
- network(_network),
- elected(false),
- proposal(0),
- index(0) {}
+// Process that implements the coordinator logic. It keeps the
+// coordinator state (election/writing status, current proposal number
+// and next log position) and exposes every operation as a future.
+class CoordinatorProcess : public Process<CoordinatorProcess>
+{
+public:
+  CoordinatorProcess(
+      size_t quorum_,
+      const Shared<Replica>& replica_,
+      const Shared<Network>& network_)
+    : ProcessBase(ID::generate("log-coordinator")),
+      quorum(quorum_),
+      replica(replica_),
+      network(network_),
+      state(INITIAL),
+      proposal(0),
+      index(0) {}
+
+  virtual ~CoordinatorProcess() {}
+
+  // Runs a coordinator election. On success the future holds the last
+  // committed log position. It holds none if the election was lost
+  // but can be retried.
+  Future<Option<uint64_t> > elect();
+
+  // Demotes the coordinator. On success the future holds the last
+  // committed log position.
+  Future<uint64_t> demote();
+
+  // Appends 'bytes' to the end of the log. On success the future
+  // holds the position of the appended entry.
+  Future<uint64_t> append(const string& bytes);
+
+  // Removes all log entries preceding the entry at position 'to'. On
+  // success the future holds the position at which the truncate
+  // operation itself was written.
+  Future<uint64_t> truncate(uint64_t to);
+
+protected:
+  // Discards any election or write that is still in flight.
+  virtual void finalize()
+  {
+    electing.discard();
+    writing.discard();
+  }
+
+private:
+  // Election related functions: the steps chained together by
+  // elect(), followed by the callbacks that update 'state' once the
+  // election future completes.
+
+  Future<uint64_t> getLastProposal();
+  Future<Nothing> updateProposal(uint64_t promised);
+  Future<PromiseResponse> runPromisePhase();
+  Future<Option<uint64_t> > checkPromisePhase(const PromiseResponse& response);
+  Future<set<uint64_t> > getMissingPositions();
+  Future<Nothing> catchupMissingPositions(const set<uint64_t>& positions);
+  Future<Option<uint64_t> > updateIndexAfterElected();
+  void electingFinished(const Option<uint64_t>& position);
+  void electingFailed();
+  void electingAborted();
+
+  // Writing related functions: the steps chained together by write(),
+  // followed by the callbacks that update 'state' once the write
+  // future completes.
+
+  Future<uint64_t> write(const Action& action);
+  Future<WriteResponse> runWritePhase(const Action& action);
+  Future<Nothing> checkWritePhase(const WriteResponse& response);
+  Future<Nothing> runLearnPhase(const Action& action);
+  Future<bool> checkLearnPhase(const Action& action);
+  Future<uint64_t> updateIndexAfterWritten(bool missing);
+  void writingFinished();
+  void writingFailed();
+  void writingAborted();
+
+  const size_t quorum;
+  const Shared<Replica> replica;
+  const Shared<Network> network;
+
+  // The current state of the coordinator. A coordinator has to be
+  // elected before it can append or truncate; attempting either while
+  // it is not in the ELECTED state returns a failed future
+  // immediately. The coordinator is ELECTING from the moment it
+  // decides to go for an election until it is elected, and it only
+  // declares itself ELECTED after it has won the election and filled
+  // all existing positions. It is WRITING while an append or truncate
+  // is in progress.
+  enum {
+    INITIAL,
+    ELECTING,
+    ELECTED,
+    WRITING,
+  } state;
+
+  // The current proposal number used by this coordinator.
+  uint64_t proposal;
+
+  // The position to which the next entry will be written.
+  uint64_t index;
+
+  // Futures of the election and of the write currently in flight;
+  // both are discarded in finalize().
+  Future<Option<uint64_t> > electing;
+  Future<uint64_t> writing;
+};
+
+
+/////////////////////////////////////////////////
+// Handles elect/demote in CoordinatorProcess.
+/////////////////////////////////////////////////
+
+
+Future<Option<uint64_t> > CoordinatorProcess::elect()
+{
+ if (state == ELECTING) {
+ return Future<Option<uint64_t> >::failed(
+ "Coordinator already being elected");
+ } else if (state == ELECTED) {
+ return Future<Option<uint64_t> >::failed(
+ "Coordinator already elected");
+ } else if (state == WRITING) {
+ return Future<Option<uint64_t> >::failed(
+ "Coordinator already elected, and is currently writing");
+ }
-Coordinator::~Coordinator() {}
+ CHECK_EQ(state, INITIAL);
+ state = ELECTING;
-Result<uint64_t> Coordinator::elect(const Timeout& timeout)
-{
- LOG(INFO) << "Coordinator attempting to get elected within "
- << timeout.remaining();
+ electing = getLastProposal()
+ .then(defer(self(), &Self::updateProposal, lambda::_1))
+ .then(defer(self(), &Self::runPromisePhase))
+ .then(defer(self(), &Self::checkPromisePhase, lambda::_1))
+ .onReady(defer(self(), &Self::electingFinished, lambda::_1))
+ .onFailed(defer(self(), &Self::electingFailed))
+ .onDiscarded(defer(self(), &Self::electingAborted));
- if (elected) {
- // TODO(benh): No-op instead of error?
- return Error("Coordinator already elected");
- }
+ return electing;
+}
- // Get the highest known promise from our local replica.
- Future<uint64_t> promised = replica->promised();
- if (!promised.await(timeout.remaining())) {
- promised.discard();
- return None();
- } else if (promised.isFailed()) {
- return Error(promised.failure());
- }
+// Gets the highest known promise from our local replica.
+Future<uint64_t> CoordinatorProcess::getLastProposal()
+{
+  Future<uint64_t> promised = replica->promised();
+  return promised;
+}
- CHECK(promised.isReady()) << "Not expecting a discarded future!";
- proposal = std::max(proposal, promised.get()) + 1; // Try the next highest!
+// Picks the proposal number for the next election attempt: one more
+// than the larger of 'promised' and the proposal number saved from a
+// previous attempt.
+Future<Nothing> CoordinatorProcess::updateProposal(uint64_t promised)
+{
+  // We may already have tried an election and lost it. The proposal
+  // number used back then is kept in field 'proposal', so the next
+  // attempt never uses a number lower than that one.
+  if (proposal < promised) {
+    proposal = promised;
+  }
+
+  // Try the next highest!
+  ++proposal;
+
+  return Nothing();
+}
- // Run the implicit promise phase.
- Future<PromiseResponse> promising = log::promise(quorum, network, proposal);
- if (!promising.await(timeout.remaining())) {
- promising.discard();
- return None();
- } else if (promising.isFailed()) {
- return Error(promising.failure());
- }
+// Runs the implicit promise phase with the current proposal number.
+Future<PromiseResponse> CoordinatorProcess::runPromisePhase()
+{
+  Future<PromiseResponse> promising =
+    log::promise(quorum, network, proposal);
+
+  return promising;
+}
- CHECK(promising.isReady()) << "Not expecting a discarded future!";
- const PromiseResponse& response = promising.get();
+Future<Option<uint64_t> > CoordinatorProcess::checkPromisePhase(
+ const PromiseResponse& response)
+{
if (!response.okay()) {
- // Lost an election, but can retry.
+ // Lost an election, but can be retried. We save the proposal
+ // number here so that most likely we will have a high enough
+ // proposal number when we retry.
+ CHECK_LE(proposal, response.proposal());
proposal = response.proposal();
+
return None();
} else {
- LOG(INFO) << "Coordinator elected, attempting to fill missing positions";
-
CHECK(response.has_position());
-
index = response.position();
// Need to "catch-up" local replica (i.e., fill in any unlearned
@@ -105,49 +222,90 @@ Result<uint64_t> Coordinator::elect(const Timeout& timeout)
// position might have been truncated, so we actually need to
// catch-up the local replica all the way to the end of the log
// before we can perform any up-to-date local reads.
+ return getMissingPositions()
+ .then(defer(self(), &Self::catchupMissingPositions, lambda::_1))
+ .then(defer(self(), &Self::updateIndexAfterElected));
+ }
+}
- Future<set<uint64_t> > positions = replica->missing(0, index);
- if (!positions.await(timeout.remaining())) {
- positions.discard();
- return None();
- } else if (positions.isFailed()) {
- return Error(positions.failure());
- }
+// Asks the local replica which positions in the range given by 0 and
+// 'index' it is missing.
+Future<set<uint64_t> > CoordinatorProcess::getMissingPositions()
+{
+  Future<set<uint64_t> > positions = replica->missing(0, index);
+  return positions;
+}
- CHECK(positions.isReady()) << "Not expecting a discarded future!";
- Future<Nothing> catching =
- log::catchup(quorum, replica, network, proposal, positions.get());
+// Catches up the local replica on the given missing positions by
+// running log::catchup with the current proposal number.
+Future<Nothing> CoordinatorProcess::catchupMissingPositions(
+    const set<uint64_t>& positions)
+{
+  LOG(INFO) << "Coordinator attempting to fill missing position";
- if (!catching.await(timeout.remaining())) {
- catching.discard();
- return None();
- } else if (catching.isFailed()) {
- return Error(catching.failure());
- }
+  return log::catchup(quorum, replica, network, proposal, positions);
+}
+
+
+// Last step of a successful election: returns the current value of
+// 'index' and advances 'index' by one.
+Future<Option<uint64_t> > CoordinatorProcess::updateIndexAfterElected()
+{
+  const uint64_t current = index;
+  index = current + 1;
+
+  return Option<uint64_t>(current);
+}
- CHECK(catching.isReady()) << "Not expecting a discarded future!";
- elected = true;
- return index++;
+// Called when the election future becomes ready. The coordinator is
+// ELECTED if a position was returned; if none was returned it goes
+// back to INITIAL.
+void CoordinatorProcess::electingFinished(const Option<uint64_t>& position)
+{
+  CHECK_EQ(state, ELECTING);
+
+  if (position.isSome()) {
+    state = ELECTED;
+  } else {
+    state = INITIAL;
   }
 }
-Result<uint64_t> Coordinator::demote()
+// Called when the election future fails: the coordinator goes back to
+// the INITIAL state.
+void CoordinatorProcess::electingFailed()
+{
+  CHECK_EQ(state, ELECTING);
+
+  state = INITIAL;
+}
+
+
+void CoordinatorProcess::electingAborted()
{
- elected = false;
+ CHECK_EQ(state, ELECTING);
+ state = INITIAL;
+}
+
+
+// Demotes an elected coordinator back to the INITIAL state and returns
+// 'index - 1'. Fails if the coordinator is not elected, is being
+// elected, or is currently writing.
+Future<uint64_t> CoordinatorProcess::demote()
+{
+  switch (state) {
+    case INITIAL:
+      return Future<uint64_t>::failed("Coordinator is not elected");
+    case ELECTING:
+      return Future<uint64_t>::failed("Coordinator is being elected");
+    case WRITING:
+      return Future<uint64_t>::failed("Coordinator is currently writing");
+    case ELECTED:
+      break;
+  }
+
+  CHECK_EQ(state, ELECTED);
+
+  state = INITIAL;
   return index - 1;
 }
-Result<uint64_t> Coordinator::append(
- const string& bytes,
- const Timeout& timeout)
+/////////////////////////////////////////////////
+// Handles write in CoordinatorProcess.
+/////////////////////////////////////////////////
+
+
+Future<uint64_t> CoordinatorProcess::append(const string& bytes)
{
- if (!elected) {
- return Error("Coordinator not elected");
+ if (state == INITIAL) {
+ return Future<uint64_t>::failed("Coordinator is not elected");
+ } else if (state == ELECTING) {
+ return Future<uint64_t>::failed("Coordinator is being elected");
+ } else if (state == WRITING) {
+ return Future<uint64_t>::failed("Coordinator is currently writing");
}
Action action;
@@ -158,23 +316,18 @@ Result<uint64_t> Coordinator::append(
Action::Append* append = action.mutable_append();
append->set_bytes(bytes);
- Result<uint64_t> result = write(action, timeout);
-
- if (result.isSome()) {
- CHECK_EQ(result.get(), index);
- index++;
- }
-
- return result;
+ return write(action);
}
-Result<uint64_t> Coordinator::truncate(
- uint64_t to,
- const Timeout& timeout)
+Future<uint64_t> CoordinatorProcess::truncate(uint64_t to)
{
- if (!elected) {
- return Error("Coordinator not elected");
+ if (state == INITIAL) {
+ return Future<uint64_t>::failed("Coordinator is not elected");
+ } else if (state == ELECTING) {
+ return Future<uint64_t>::failed("Coordinator is being elected");
+ } else if (state == WRITING) {
+ return Future<uint64_t>::failed("Coordinator is currently writing");
}
Action action;
@@ -185,84 +338,227 @@ Result<uint64_t> Coordinator::truncate(
Action::Truncate* truncate = action.mutable_truncate();
truncate->set_to(to);
- Result<uint64_t> result = write(action, timeout);
-
- if (result.isSome()) {
- CHECK_EQ(result.get(), index);
- index++;
- }
-
- return result;
+ return write(action);
}
-Result<uint64_t> Coordinator::write(
- const Action& action,
- const Timeout& timeout)
+Future<uint64_t> CoordinatorProcess::write(const Action& action)
{
LOG(INFO) << "Coordinator attempting to write "
<< Action::Type_Name(action.type())
- << " action at position " << action.position()
- << " within " << timeout.remaining();
+ << " action at position " << action.position();
- CHECK(elected);
+ CHECK_EQ(state, ELECTED);
+ CHECK(action.has_performed() && action.has_type());
- CHECK(action.has_performed());
- CHECK(action.has_type());
+ state = WRITING;
- Future<WriteResponse> writing =
- log::write(quorum, network, proposal, action);
+ writing = runWritePhase(action)
+ .then(defer(self(), &Self::checkWritePhase, lambda::_1))
+ .then(defer(self(), &Self::runLearnPhase, action))
+ .then(defer(self(), &Self::checkLearnPhase, action))
+ .then(defer(self(), &Self::updateIndexAfterWritten, lambda::_1))
+ .onReady(defer(self(), &Self::writingFinished))
+ .onFailed(defer(self(), &Self::writingFailed))
+ .onDiscarded(defer(self(), &Self::writingAborted));
- if (!writing.await(timeout.remaining())) {
- writing.discard();
- return None();
- } else if (writing.isFailed()) {
- return Error(writing.failure());
- }
+ return writing;
+}
- CHECK(writing.isReady()) << "Not expecting a discarded future!";
- const WriteResponse& response = writing.get();
- if (!response.okay()) {
- elected = false;
+// Runs the write phase for 'action' with the current proposal number.
+Future<WriteResponse> CoordinatorProcess::runWritePhase(const Action& action)
+{
+  Future<WriteResponse> response =
+    log::write(quorum, network, proposal, action);
+
+  return response;
+}
+
+
+Future<Nothing> CoordinatorProcess::checkWritePhase(
+ const WriteResponse& response)
+{
+ if (!response.okay()) {
+ // Received a NACK. Save the proposal number.
+ CHECK_LE(proposal, response.proposal());
proposal = response.proposal();
- return Error("Coordinator demoted");
+
+ return Future<Nothing>::failed("Coordinator demoted");
} else {
- // TODO(jieyu): Currently, each log operation (append or truncate)
- // will write the same log content to the local disk twice: one
- // from log::write() and one from log::learn(). In the future, we
- // may want to use checksum to eliminate the duplicate disk write.
- Future<Nothing> learning = log::learn(network, action);
-
- // We need to make sure that learned message has been broadcasted,
- // thus has been enqueued. Otherwise, our "missing" check below
- // will fail sometimes due to race condition.
- if (!learning.await(timeout.remaining())) {
- learning.discard();
- return None();
- } else if (learning.isFailed()) {
- return Error(learning.failure());
- }
+ return Nothing();
+ }
+}
- CHECK(learning.isReady()) << "Not expecting a discarded future!";
- // Make sure that the local replica has learned the newly written
- // log entry. Since messages are delivered and dispatched in order
- // locally, we should always have the new entry learned by now.
- Future<bool> checking = replica->missing(action.position());
+// Runs the learn phase for 'action' on the network.
+Future<Nothing> CoordinatorProcess::runLearnPhase(const Action& action)
+{
+  Future<Nothing> learning = log::learn(network, action);
+  return learning;
+}
+
+
+// Asks the local replica whether it is still missing the position of
+// 'action'.
+Future<bool> CoordinatorProcess::checkLearnPhase(const Action& action)
+{
+  // The local replica must have learned the newly written log entry.
+  // Messages are delivered and dispatched in order locally, so the
+  // new entry should always be learned by the time we ask.
+  Future<bool> missing = replica->missing(action.position());
+  return missing;
+}
+
+
+// Last step of a write: checks that the local replica is not missing
+// the written position, then returns the current value of 'index' and
+// advances 'index' by one.
+Future<uint64_t> CoordinatorProcess::updateIndexAfterWritten(bool missing)
+{
+  CHECK(!missing) << "Not expecting local replica to be missing position "
+                  << index << " after the writing is done";
+
+  const uint64_t written = index;
+  index = written + 1;
+
+  return written;
+}
+
+
+// Called when the write future becomes ready: the coordinator returns
+// to the ELECTED state.
+void CoordinatorProcess::writingFinished()
+{
+  CHECK_EQ(state, WRITING);
+
+  state = ELECTED;
+}
+
+
+// Called when the write future fails: the coordinator drops back to
+// the INITIAL state.
+void CoordinatorProcess::writingFailed()
+{
+  CHECK_EQ(state, WRITING);
+
+  state = INITIAL;
+}
+
+
+// Called when the write future is discarded: the coordinator returns
+// to the ELECTED state.
+void CoordinatorProcess::writingAborted()
+{
+  CHECK_EQ(state, WRITING);
+
+  state = ELECTED;
+}
+
+
+/////////////////////////////////////////////////
+// Coordinator implementation.
+/////////////////////////////////////////////////
+
+
+// Creates the underlying CoordinatorProcess and spawns it.
+Coordinator::Coordinator(
+    size_t quorum,
+    const Shared<Replica>& replica,
+    const Shared<Network>& network)
+  : process(new CoordinatorProcess(quorum, replica, network))
+{
+  spawn(process);
+}
+
+
+// Terminates the underlying process and waits for it before deleting
+// it.
+Coordinator::~Coordinator()
+{
+  terminate(process);
+  process::wait(process);
+
+  delete process;
+}
+
+
+Result<uint64_t> Coordinator::elect(const Timeout& timeout)
+{
+ LOG(INFO) << "Coordinator attempting to get elected within "
+ << timeout.remaining();
+
+ Future<Option<uint64_t> > electing =
+ dispatch(process, &CoordinatorProcess::elect);
+
+ electing.await(timeout.remaining());
+
+ CHECK(!electing.isDiscarded());
+
+ if (electing.isPending()) {
+ LOG(INFO) << "Coordinator timed out while trying to get elected";
+
+ electing.discard();
+ return None();
+ } else if (electing.isFailed()) {
+ LOG(ERROR) << "Coordinator failed to get elected: "
+ << electing.failure();
+
+ return Error(electing.failure());
+ } else {
+ if (electing.get().isNone()) {
+ LOG(INFO) << "Coordinator lost an election, but can be retried";
- if (!checking.await(timeout.remaining())) {
- checking.discard();
return None();
- } else if (checking.isFailed()) {
- return Error(checking.failure());
+ } else {
+ LOG(INFO) << "Coordinator elected with current position "
+ << electing.get().get();
+
+ return electing.get().get();
}
+ }
+}
+
+
+// Blocking wrapper around CoordinatorProcess::demote: dispatches the
+// demotion, waits for it to complete and turns a failure into an
+// Error.
+Result<uint64_t> Coordinator::demote()
+{
+  Future<uint64_t> outcome =
+    dispatch(process, &CoordinatorProcess::demote);
+
+  outcome.await(); // TODO(jieyu): Use a timeout.
+
+  CHECK(!outcome.isDiscarded());
+
+  if (!outcome.isFailed()) {
+    return outcome.get();
+  }
+
+  return Error(outcome.failure());
+}
+
+
+Result<uint64_t> Coordinator::append(
+ const string& bytes,
+ const Timeout& timeout)
+{
+ Future<uint64_t> appending =
+ dispatch(process, &CoordinatorProcess::append, bytes);
+
+ appending.await(timeout.remaining());
- CHECK(checking.isReady()) << "Not expecting a discarded future!";
+ CHECK(!appending.isDiscarded());
- CHECK(!checking.get());
+ if (appending.isPending()) {
+ LOG(INFO) << "Coordinator timed out while trying to append";
- return action.position();
+ appending.discard();
+ return None();
+ } else if (appending.isFailed()) {
+ LOG(ERROR) << "Coordinator failed to append the log: "
+ << appending.failure();
+
+ return Error(appending.failure());
+ } else {
+ return appending.get();
+ }
+}
+
+
+// Blocking wrapper around CoordinatorProcess::truncate. Returns none
+// (after discarding the operation) if it is still pending once the
+// timeout has elapsed, an Error if it failed, and its result
+// otherwise.
+Result<uint64_t> Coordinator::truncate(
+    uint64_t to,
+    const Timeout& timeout)
+{
+  Future<uint64_t> outcome =
+    dispatch(process, &CoordinatorProcess::truncate, to);
+
+  outcome.await(timeout.remaining());
+
+  CHECK(!outcome.isDiscarded());
+
+  // Pending has to be checked first: once the future is no longer
+  // pending its state is final.
+  if (outcome.isPending()) {
+    LOG(INFO) << "Coordinator timed out while trying to truncate";
+
+    outcome.discard();
+    return None();
+  } else if (outcome.isFailed()) {
+    LOG(ERROR) << "Coordinator failed to truncate the log: "
+               << outcome.failure();
+
+    return Error(outcome.failure());
+  } else {
+    return outcome.get();
   }
 }
http://git-wip-us.apache.org/repos/asf/mesos/blob/6ea7c14e/src/log/coordinator.hpp
----------------------------------------------------------------------
diff --git a/src/log/coordinator.hpp b/src/log/coordinator.hpp
index b0ff8df..43cb530 100644
--- a/src/log/coordinator.hpp
+++ b/src/log/coordinator.hpp
@@ -35,6 +35,10 @@ namespace mesos {
namespace internal {
namespace log {
+// Forward declaration.
+class CoordinatorProcess;
+
+
class Coordinator
{
public:
@@ -66,17 +70,7 @@ public:
Result<uint64_t> truncate(uint64_t to, const process::Timeout& timeout);
private:
- Result<uint64_t> write(
- const Action& action,
- const process::Timeout& timeout);
-
- const size_t quorum;
- const process::Shared<Replica> replica;
- const process::Shared<Network> network;
-
- bool elected; // True if this coordinator has been elected.
- uint64_t proposal; // Currently used proposal number.
- uint64_t index; // Last position written in the log.
+ CoordinatorProcess* process;
};
} // namespace log {
[07/10] git commit: Adjusted log tests to use the new log tool.
Posted by be...@apache.org.
Adjusted log tests to use the new log tool.
From: Jie Yu <yu...@gmail.com>
Review: https://reviews.apache.org/r/16946
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/fa5d450c
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/fa5d450c
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/fa5d450c
Branch: refs/heads/master
Commit: fa5d450c2355031296c5486b87cb3b5dcc68e470
Parents: e2fe586
Author: Benjamin Hindman <be...@gmail.com>
Authored: Thu Jan 16 16:55:36 2014 -0800
Committer: Benjamin Hindman <be...@gmail.com>
Committed: Thu Jan 16 16:55:36 2014 -0800
----------------------------------------------------------------------
src/tests/log_tests.cpp | 223 ++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 219 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/fa5d450c/src/tests/log_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/log_tests.cpp b/src/tests/log_tests.cpp
index aeebfb7..f866dde 100644
--- a/src/tests/log_tests.cpp
+++ b/src/tests/log_tests.cpp
@@ -43,9 +43,15 @@
#include "log/network.hpp"
#include "log/recover.hpp"
#include "log/replica.hpp"
+#include "log/tool/initialize.hpp"
+#include "tests/environment.hpp"
#include "tests/utils.hpp"
+#ifdef MESOS_HAS_JAVA
+#include "tests/zookeeper.hpp"
+#endif
+
using namespace mesos;
using namespace mesos::internal;
using namespace mesos::internal::log;
@@ -62,12 +68,19 @@ using testing::Eq;
using testing::Return;
-class ReplicaTest : public TemporaryDirectoryTest {};
+class ReplicaTest : public TemporaryDirectoryTest
+{
+protected:
+ // Log initialization tool; tests set 'flags.path' and call execute().
+ tool::Initialize initializer;
+};
TEST_F(ReplicaTest, Promise)
{
const string path = os::getcwd() + "/.log";
+ initializer.flags.path = path;
+ initializer.execute();
Replica replica(path);
@@ -118,6 +131,8 @@ TEST_F(ReplicaTest, Promise)
TEST_F(ReplicaTest, Append)
{
const string path = os::getcwd() + "/.log";
+ initializer.flags.path = path;
+ initializer.execute();
Replica replica(path);
@@ -177,6 +192,8 @@ TEST_F(ReplicaTest, Append)
TEST_F(ReplicaTest, Restore)
{
const string path = os::getcwd() + "/.log";
+ initializer.flags.path = path;
+ initializer.execute();
Replica replica1(path);
@@ -257,13 +274,23 @@ TEST_F(ReplicaTest, Restore)
}
-class CoordinatorTest : public TemporaryDirectoryTest {};
+class CoordinatorTest : public TemporaryDirectoryTest
+{
+protected:
+ // Log initialization tool; tests set 'flags.path' and call execute().
+ tool::Initialize initializer;
+};
TEST_F(CoordinatorTest, Elect)
{
const string path1 = os::getcwd() + "/.log1";
+ initializer.flags.path = path1;
+ initializer.execute();
+
const string path2 = os::getcwd() + "/.log2";
+ initializer.flags.path = path2;
+ initializer.execute();
Shared<Replica> replica1(new Replica(path1));
Shared<Replica> replica2(new Replica(path2));
@@ -297,7 +324,12 @@ TEST_F(CoordinatorTest, Elect)
TEST_F(CoordinatorTest, AppendRead)
{
const string path1 = os::getcwd() + "/.log1";
+ initializer.flags.path = path1;
+ initializer.execute();
+
const string path2 = os::getcwd() + "/.log2";
+ initializer.flags.path = path2;
+ initializer.execute();
Shared<Replica> replica1(new Replica(path1));
Shared<Replica> replica2(new Replica(path2));
@@ -341,7 +373,12 @@ TEST_F(CoordinatorTest, AppendRead)
TEST_F(CoordinatorTest, AppendReadError)
{
const string path1 = os::getcwd() + "/.log1";
+ initializer.flags.path = path1;
+ initializer.execute();
+
const string path2 = os::getcwd() + "/.log2";
+ initializer.flags.path = path2;
+ initializer.execute();
Shared<Replica> replica1(new Replica(path1));
Shared<Replica> replica2(new Replica(path2));
@@ -382,6 +419,8 @@ TEST_F(CoordinatorTest, AppendReadError)
TEST_F(CoordinatorTest, ElectNoQuorum)
{
const string path = os::getcwd() + "/.log";
+ initializer.flags.path = path;
+ initializer.execute();
Shared<Replica> replica(new Replica(path));
@@ -408,7 +447,12 @@ TEST_F(CoordinatorTest, ElectNoQuorum)
TEST_F(CoordinatorTest, AppendNoQuorum)
{
const string path1 = os::getcwd() + "/.log1";
+ initializer.flags.path = path1;
+ initializer.execute();
+
const string path2 = os::getcwd() + "/.log2";
+ initializer.flags.path = path2;
+ initializer.execute();
Shared<Replica> replica1(new Replica(path1));
Shared<Replica> replica2(new Replica(path2));
@@ -448,7 +492,12 @@ TEST_F(CoordinatorTest, AppendNoQuorum)
TEST_F(CoordinatorTest, Failover)
{
const string path1 = os::getcwd() + "/.log1";
+ initializer.flags.path = path1;
+ initializer.execute();
+
const string path2 = os::getcwd() + "/.log2";
+ initializer.flags.path = path2;
+ initializer.execute();
Shared<Replica> replica1(new Replica(path1));
Shared<Replica> replica2(new Replica(path2));
@@ -503,7 +552,12 @@ TEST_F(CoordinatorTest, Failover)
TEST_F(CoordinatorTest, Demoted)
{
const string path1 = os::getcwd() + "/.log1";
+ initializer.flags.path = path1;
+ initializer.execute();
+
const string path2 = os::getcwd() + "/.log2";
+ initializer.flags.path = path2;
+ initializer.execute();
Shared<Replica> replica1(new Replica(path1));
Shared<Replica> replica2(new Replica(path2));
@@ -571,8 +625,16 @@ TEST_F(CoordinatorTest, Demoted)
TEST_F(CoordinatorTest, Fill)
{
const string path1 = os::getcwd() + "/.log1";
+ initializer.flags.path = path1;
+ initializer.execute();
+
const string path2 = os::getcwd() + "/.log2";
+ initializer.flags.path = path2;
+ initializer.execute();
+
const string path3 = os::getcwd() + "/.log3";
+ initializer.flags.path = path3;
+ initializer.execute();
Shared<Replica> replica1(new Replica(path1));
Shared<Replica> replica2(new Replica(path2));
@@ -637,8 +699,16 @@ TEST_F(CoordinatorTest, Fill)
TEST_F(CoordinatorTest, NotLearnedFill)
{
const string path1 = os::getcwd() + "/.log1";
+ initializer.flags.path = path1;
+ initializer.execute();
+
const string path2 = os::getcwd() + "/.log2";
+ initializer.flags.path = path2;
+ initializer.execute();
+
const string path3 = os::getcwd() + "/.log3";
+ initializer.flags.path = path3;
+ initializer.execute();
Shared<Replica> replica1(new Replica(path1));
Shared<Replica> replica2(new Replica(path2));
@@ -707,7 +777,12 @@ TEST_F(CoordinatorTest, NotLearnedFill)
TEST_F(CoordinatorTest, MultipleAppends)
{
const string path1 = os::getcwd() + "/.log1";
+ initializer.flags.path = path1;
+ initializer.execute();
+
const string path2 = os::getcwd() + "/.log2";
+ initializer.flags.path = path2;
+ initializer.execute();
Shared<Replica> replica1(new Replica(path1));
Shared<Replica> replica2(new Replica(path2));
@@ -749,8 +824,16 @@ TEST_F(CoordinatorTest, MultipleAppends)
TEST_F(CoordinatorTest, MultipleAppendsNotLearnedFill)
{
const string path1 = os::getcwd() + "/.log1";
+ initializer.flags.path = path1;
+ initializer.execute();
+
const string path2 = os::getcwd() + "/.log2";
+ initializer.flags.path = path2;
+ initializer.execute();
+
const string path3 = os::getcwd() + "/.log3";
+ initializer.flags.path = path3;
+ initializer.execute();
Shared<Replica> replica1(new Replica(path1));
Shared<Replica> replica2(new Replica(path2));
@@ -817,7 +900,12 @@ TEST_F(CoordinatorTest, MultipleAppendsNotLearnedFill)
TEST_F(CoordinatorTest, Truncate)
{
const string path1 = os::getcwd() + "/.log1";
+ initializer.flags.path = path1;
+ initializer.execute();
+
const string path2 = os::getcwd() + "/.log2";
+ initializer.flags.path = path2;
+ initializer.execute();
Shared<Replica> replica1(new Replica(path1));
Shared<Replica> replica2(new Replica(path2));
@@ -871,8 +959,16 @@ TEST_F(CoordinatorTest, Truncate)
TEST_F(CoordinatorTest, TruncateNotLearnedFill)
{
const string path1 = os::getcwd() + "/.log1";
+ initializer.flags.path = path1;
+ initializer.execute();
+
const string path2 = os::getcwd() + "/.log2";
+ initializer.flags.path = path2;
+ initializer.execute();
+
const string path3 = os::getcwd() + "/.log3";
+ initializer.flags.path = path3;
+ initializer.execute();
Shared<Replica> replica1(new Replica(path1));
Shared<Replica> replica2(new Replica(path2));
@@ -951,8 +1047,16 @@ TEST_F(CoordinatorTest, TruncateNotLearnedFill)
TEST_F(CoordinatorTest, TruncateLearnedFill)
{
const string path1 = os::getcwd() + "/.log1";
+ initializer.flags.path = path1;
+ initializer.execute();
+
const string path2 = os::getcwd() + "/.log2";
+ initializer.flags.path = path2;
+ initializer.execute();
+
const string path3 = os::getcwd() + "/.log3";
+ initializer.flags.path = path3;
+ initializer.execute();
Shared<Replica> replica1(new Replica(path1));
Shared<Replica> replica2(new Replica(path2));
@@ -1024,15 +1128,29 @@ TEST_F(CoordinatorTest, TruncateLearnedFill)
}
-class RecoverTest : public TemporaryDirectoryTest {};
+class RecoverTest : public TemporaryDirectoryTest
+{
+protected:
+ // For initializing the log.
+ tool::Initialize initializer;
+};
// Two logs both need recovery compete with each other.
TEST_F(RecoverTest, RacingCatchup)
{
const string path1 = os::getcwd() + "/.log1";
+ initializer.flags.path = path1;
+ initializer.execute();
+
const string path2 = os::getcwd() + "/.log2";
+ initializer.flags.path = path2;
+ initializer.execute();
+
const string path3 = os::getcwd() + "/.log3";
+ initializer.flags.path = path3;
+ initializer.execute();
+
const string path4 = os::getcwd() + "/.log4";
const string path5 = os::getcwd() + "/.log5";
@@ -1121,13 +1239,23 @@ TEST_F(RecoverTest, RacingCatchup)
}
-class LogTest : public TemporaryDirectoryTest {};
+class LogTest : public TemporaryDirectoryTest
+{
+protected:
+ // For initializing the log.
+ tool::Initialize initializer;
+};
TEST_F(LogTest, WriteRead)
{
const string path1 = os::getcwd() + "/.log1";
+ initializer.flags.path = path1;
+ initializer.execute();
+
const string path2 = os::getcwd() + "/.log2";
+ initializer.flags.path = path2;
+ initializer.execute();
Replica replica1(path1);
@@ -1158,7 +1286,12 @@ TEST_F(LogTest, WriteRead)
TEST_F(LogTest, Position)
{
const string path1 = os::getcwd() + "/.log1";
+ initializer.flags.path = path1;
+ initializer.execute();
+
const string path2 = os::getcwd() + "/.log2";
+ initializer.flags.path = path2;
+ initializer.execute();
Replica replica1(path1);
@@ -1178,6 +1311,88 @@ TEST_F(LogTest, Position)
}
+#ifdef MESOS_HAS_JAVA
+// TODO(jieyu): We copy the code from TemporaryDirectoryTest here
+// because we cannot inherit from two test fixtures. In the future,
+// we need a way to compose multiple test fixtures together.
+class LogZooKeeperTest : public ZooKeeperTest
+{
+protected:
+ virtual void SetUp()
+ {
+ ZooKeeperTest::SetUp();
+
+ // Save the current working directory.
+ cwd = os::getcwd();
+
+ // Create a temporary directory for the test.
+ Try<string> directory = environment->mkdtemp();
+
+ ASSERT_SOME(directory) << "Failed to mkdtemp";
+
+ sandbox = directory.get();
+
+ LOG(INFO) << "Using temporary directory '" << sandbox.get() << "'";
+
+ // Run the test out of the temporary directory we created.
+ ASSERT_TRUE(os::chdir(sandbox.get()))
+ << "Failed to chdir into '" << sandbox.get() << "'";
+ }
+
+ virtual void TearDown()
+ {
+ // Return to previous working directory and cleanup the sandbox.
+ ASSERT_TRUE(os::chdir(cwd));
+
+ if (sandbox.isSome()) {
+ ASSERT_SOME(os::rmdir(sandbox.get()));
+ }
+ }
+
+ // For initializing the log.
+ tool::Initialize initializer;
+
+private:
+ string cwd;
+ Option<string> sandbox;
+};
+
+
+TEST_F(LogZooKeeperTest, WriteRead)
+{
+ const string path1 = os::getcwd() + "/.log1";
+ initializer.flags.path = path1;
+ initializer.execute();
+
+ const string path2 = os::getcwd() + "/.log2";
+ initializer.flags.path = path2;
+ initializer.execute();
+
+ string servers = server->connectString();
+
+ Log log1(2, path1, servers, NO_TIMEOUT, "/log/", None());
+ Log log2(2, path2, servers, NO_TIMEOUT, "/log/", None());
+
+ Log::Writer writer(&log2, Seconds(10));
+
+ Result<Log::Position> position =
+ writer.append("hello world", Timeout::in(Seconds(10)));
+
+ ASSERT_SOME(position);
+
+ Log::Reader reader(&log2);
+
+ Result<list<Log::Entry> > entries =
+ reader.read(position.get(), position.get(), Timeout::in(Seconds(10)));
+
+ ASSERT_SOME(entries);
+ ASSERT_EQ(1u, entries.get().size());
+ EXPECT_EQ(position.get(), entries.get().front().position);
+ EXPECT_EQ("hello world", entries.get().front().data);
+}
+#endif // MESOS_HAS_JAVA
+
+
TEST_F(CoordinatorTest, RacingElect) {}
TEST_F(CoordinatorTest, FillNoQuorum) {}
[04/10] Added log recovery support.
Posted by be...@apache.org.
http://git-wip-us.apache.org/repos/asf/mesos/blob/f9b60c4c/src/log/replica.cpp
----------------------------------------------------------------------
diff --git a/src/log/replica.cpp b/src/log/replica.cpp
index 032180a..da9310f 100644
--- a/src/log/replica.cpp
+++ b/src/log/replica.cpp
@@ -36,6 +36,8 @@
#include <stout/stopwatch.hpp>
#include <stout/utils.hpp>
+#include "common/type_utils.hpp"
+
#include "log/replica.hpp"
#include "logging/logging.hpp"
@@ -59,13 +61,14 @@ namespace protocol {
// Some replica protocol definitions.
Protocol<PromiseRequest, PromiseResponse> promise;
Protocol<WriteRequest, WriteResponse> write;
+Protocol<RecoverRequest, RecoverResponse> recover;
} // namespace protocol {
struct State
{
- uint64_t proposal; // Last promise made.
+ Metadata metadata; // The metadata for the replica.
uint64_t begin; // Beginning position of the log.
uint64_t end; // Ending position of the log.
set<uint64_t> learned; // Positions present and learned
@@ -78,8 +81,8 @@ class Storage
{
public:
virtual ~Storage() {}
- virtual Try<State> recover(const string& path) = 0;
- virtual Try<Nothing> persist(const Promise& promise) = 0;
+ virtual Try<State> restore(const string& path) = 0;
+ virtual Try<Nothing> persist(const Metadata& metadata) = 0;
virtual Try<Nothing> persist(const Action& action) = 0;
virtual Try<Action> read(uint64_t position) = 0;
};
@@ -92,8 +95,8 @@ public:
LevelDBStorage();
virtual ~LevelDBStorage();
- virtual Try<State> recover(const string& path);
- virtual Try<Nothing> persist(const Promise& promise);
+ virtual Try<State> restore(const string& path);
+ virtual Try<Nothing> persist(const Metadata& metadata);
virtual Try<Nothing> persist(const Action& action);
virtual Try<Action> read(uint64_t position);
@@ -137,7 +140,8 @@ private:
// Returns a string representing the specified position. Note that
// we adjust the actual position by incrementing it by 1 because we
- // reserve 0 for storing the promise record (Record::Promise).
+ // reserve 0 for storing the promise record (Record::Promise,
+ // DEPRECATED!), or the metadata (Record::Metadata).
static string encode(uint64_t position, bool adjust = true)
{
// Adjusted stringified represenation is plus 1 of actual position.
@@ -194,7 +198,7 @@ LevelDBStorage::~LevelDBStorage()
}
-Try<State> LevelDBStorage::recover(const string& path)
+Try<State> LevelDBStorage::restore(const string& path)
{
leveldb::Options options;
options.create_if_missing = true;
@@ -236,7 +240,6 @@ Try<State> LevelDBStorage::recover(const string& path)
LOG(INFO) << "Compacted db in " << stopwatch.elapsed();
State state;
- state.proposal = 0;
state.begin = 0;
state.end = 0;
@@ -276,14 +279,18 @@ Try<State> LevelDBStorage::recover(const string& path)
switch (record.type()) {
case Record::METADATA: {
CHECK(record.has_metadata());
- state.proposal = record.metadata().promised();
+ state.metadata.CopyFrom(record.metadata());
break;
}
// DEPRECATED!
case Record::PROMISE: {
CHECK(record.has_promise());
- state.proposal = record.promise().proposal();
+ // This replica is in old format. Set its status to VOTING
+ // since there is no catch-up logic in the old code and this
+ // replica is obviously not empty.
+ state.metadata.set_status(Metadata::VOTING);
+ state.metadata.set_promised(record.promise().proposal());
break;
}
@@ -332,7 +339,7 @@ Try<State> LevelDBStorage::recover(const string& path)
}
-Try<Nothing> LevelDBStorage::persist(const Promise& promise)
+Try<Nothing> LevelDBStorage::persist(const Metadata& metadata)
{
Stopwatch stopwatch;
stopwatch.start();
@@ -341,8 +348,8 @@ Try<Nothing> LevelDBStorage::persist(const Promise& promise)
options.sync = true;
Record record;
- record.set_type(Record::PROMISE);
- record.mutable_promise()->MergeFrom(promise);
+ record.set_type(Record::METADATA);
+ record.mutable_metadata()->CopyFrom(metadata);
string value;
@@ -356,7 +363,7 @@ Try<Nothing> LevelDBStorage::persist(const Promise& promise)
return Error(status.ToString());
}
- LOG(INFO) << "Persisting promise (" << value.size()
+ LOG(INFO) << "Persisting metadata (" << value.size()
<< " bytes) to leveldb took " << stopwatch.elapsed();
return Nothing();
@@ -513,9 +520,16 @@ public:
// Returns the last written position in the log.
uint64_t ending();
+ // Returns the current status of this replica.
+ Metadata::Status status();
+
// Returns the highest implicit promise this replica has given.
uint64_t promised();
+ // Updates the status of this replica. The update will be persisted on
+ // the disk. Returns true on success and false otherwise.
+ bool update(const Metadata::Status& status);
+
private:
// Handles a request from a proposer to promise not to accept writes
// from any other proposer with lower proposal number.
@@ -524,22 +538,30 @@ private:
// Handles a request from a proposer to write an action.
void write(const WriteRequest& request);
+ // Handles a request from a recover process.
+ void recover(const RecoverRequest& request);
+
// Handles a message notifying of a learned action.
void learned(const Action& action);
// Helper routines that write a record corresponding to the
// specified argument. Returns true on success and false otherwise.
- bool persist(const Promise& promise);
bool persist(const Action& action);
- // Helper routine to recover log (e.g., on restart).
- void recover(const string& path);
+ // Helper routines that update metadata corresponding to the
+ // specified argument. The update will be persisted on the disk.
+ // Returns true on success and false otherwise.
+ bool update(uint64_t promised);
+
+ // Helper routine to restore log (e.g., on restart).
+ void restore(const string& path);
// Underlying storage for the log.
Storage* storage;
- // Last promise made to a proposer.
- uint64_t proposal;
+ // The cached metadata for this replica. It includes the current
+ // status of the replica and the last promise it made.
+ Metadata metadata;
// Beginning position of log (after *learned* truncations).
uint64_t begin;
@@ -557,14 +579,13 @@ private:
ReplicaProcess::ReplicaProcess(const string& path)
: ProcessBase(ID::generate("log-replica")),
- proposal(0),
begin(0),
end(0)
{
// TODO(benh): Factor out and expose storage.
storage = new LevelDBStorage();
- recover(path);
+ restore(path);
// Install protobuf handlers.
install<PromiseRequest>(
@@ -573,6 +594,9 @@ ReplicaProcess::ReplicaProcess(const string& path)
install<WriteRequest>(
&ReplicaProcess::write);
+ install<RecoverRequest>(
+ &ReplicaProcess::recover);
+
install<LearnedMessage>(
&ReplicaProcess::learned,
&LearnedMessage::action);
@@ -700,9 +724,59 @@ uint64_t ReplicaProcess::ending()
}
+Metadata::Status ReplicaProcess::status()
+{
+ return metadata.status();
+}
+
+
uint64_t ReplicaProcess::promised()
{
- return proposal;
+ return metadata.promised();
+}
+
+
+bool ReplicaProcess::update(const Metadata::Status& status)
+{
+ Metadata metadata_;
+ metadata_.set_status(status);
+ metadata_.set_promised(promised());
+
+ Try<Nothing> persisted = storage->persist(metadata_);
+
+ if (persisted.isError()) {
+ LOG(ERROR) << "Error writing to log: " << persisted.error();
+ return false;
+ }
+
+ LOG(INFO) << "Persisted replica status to " << status;
+
+ // Update the cached metadata.
+ metadata.set_status(status);
+
+ return true;
+}
+
+
+bool ReplicaProcess::update(uint64_t promised)
+{
+ Metadata metadata_;
+ metadata_.set_status(status());
+ metadata_.set_promised(promised);
+
+ Try<Nothing> persisted = storage->persist(metadata_);
+
+ if (persisted.isError()) {
+ LOG(ERROR) << "Error writing to log: " << persisted.error();
+ return false;
+ }
+
+ LOG(INFO) << "Persisted promised to " << promised;
+
+ // Update the cached metadata.
+ metadata.set_promised(promised);
+
+ return true;
}
@@ -722,6 +796,13 @@ uint64_t ReplicaProcess::promised()
void ReplicaProcess::promise(const PromiseRequest& request)
{
+ // Ignore promise requests if this replica is not in VOTING status.
+ if (status() != Metadata::VOTING) {
+ LOG(INFO) << "Replica ignoring promise request as it is in "
+ << status() << " status";
+ return;
+ }
+
if (request.has_position()) {
LOG(INFO) << "Replica received explicit promise request for position "
<< request.position() << " with proposal " << request.proposal();
@@ -742,8 +823,8 @@ void ReplicaProcess::promise(const PromiseRequest& request)
if (request.position() < begin) {
Action action;
action.set_position(request.position());
- action.set_promised(proposal); // Use the last promised proposal.
- action.set_performed(proposal); // Use the last promised proposal.
+ action.set_promised(promised()); // Use the last promised proposal.
+ action.set_performed(promised()); // Use the last promised proposal.
action.set_learned(true);
action.set_type(Action::NOP);
action.mutable_nop()->MergeFrom(Action::Nop());
@@ -776,14 +857,14 @@ void ReplicaProcess::promise(const PromiseRequest& request)
// As a result, proposer 1 can successfully write a value X to
// log position 1 and thinks that X is agreed, while proposer 2
// can later write a value Y and also believes that Y is agreed.
- if (request.proposal() <= proposal) {
+ if (request.proposal() <= promised()) {
// If a promise request is rejected because of the proposal
// number check, we reply with the currently promised proposal
// number so that the proposer can bump its proposal number
// and retry if needed to ensure liveness.
PromiseResponse response;
response.set_okay(false);
- response.set_proposal(proposal);
+ response.set_proposal(promised());
reply(response);
} else {
Action action;
@@ -825,20 +906,16 @@ void ReplicaProcess::promise(const PromiseRequest& request)
LOG(INFO) << "Replica received implicit promise request with proposal "
<< request.proposal();
- if (request.proposal() <= proposal) { // Only make an implicit promise once!
+ if (request.proposal() <= promised()) {
+ // Only make an implicit promise once!
LOG(INFO) << "Replica denying promise request with proposal "
<< request.proposal();
PromiseResponse response;
response.set_okay(false);
- response.set_proposal(proposal);
+ response.set_proposal(promised());
reply(response);
} else {
- Promise promise;
- promise.set_proposal(request.proposal());
-
- if (persist(promise)) {
- proposal = request.proposal();
-
+ if (update(request.proposal())) {
// Return the last position written.
PromiseResponse response;
response.set_okay(true);
@@ -853,6 +930,13 @@ void ReplicaProcess::promise(const PromiseRequest& request)
void ReplicaProcess::write(const WriteRequest& request)
{
+ // Ignore write requests if this replica is not in VOTING status.
+ if (status() != Metadata::VOTING) {
+ LOG(INFO) << "Replica ignoring write request as it is in "
+ << status() << " status";
+ return;
+ }
+
LOG(INFO) << "Replica received write request for position "
<< request.position();
@@ -862,16 +946,16 @@ void ReplicaProcess::write(const WriteRequest& request)
LOG(ERROR) << "Error getting log record at " << request.position()
<< ": " << result.error();
} else if (result.isNone()) {
- if (request.proposal() < proposal) {
+ if (request.proposal() < promised()) {
WriteResponse response;
response.set_okay(false);
- response.set_proposal(proposal);
+ response.set_proposal(promised());
response.set_position(request.position());
reply(response);
} else {
Action action;
action.set_position(request.position());
- action.set_promised(proposal);
+ action.set_promised(promised());
action.set_performed(request.proposal());
if (request.has_learned()) action.set_learned(request.learned());
action.set_type(request.type());
@@ -969,32 +1053,34 @@ void ReplicaProcess::write(const WriteRequest& request)
}
-void ReplicaProcess::learned(const Action& action)
+void ReplicaProcess::recover(const RecoverRequest& request)
{
- LOG(INFO) << "Replica received learned notice for position "
- << action.position();
+ LOG(INFO) << "Replica in " << status()
+ << " status received a broadcasted recover request";
- CHECK(action.learned());
+ RecoverResponse response;
+ response.set_status(status());
- if (persist(action)) {
- LOG(INFO) << "Replica learned " << Action::Type_Name(action.type())
- << " action at position " << action.position();
+ if (status() == Metadata::VOTING) {
+ response.set_begin(begin);
+ response.set_end(end);
}
+
+ reply(response);
}
-bool ReplicaProcess::persist(const Promise& promise)
+void ReplicaProcess::learned(const Action& action)
{
- Try<Nothing> persisted = storage->persist(promise);
-
- if (persisted.isError()) {
- LOG(ERROR) << "Error writing to log: " << persisted.error();
- return false;
- }
+ LOG(INFO) << "Replica received learned notice for position "
+ << action.position();
- LOG(INFO) << "Persisted promise to " << promise.proposal();
+ CHECK(action.learned());
- return true;
+ if (persist(action)) {
+ LOG(INFO) << "Replica learned " << action.type()
+ << " action at position " << action.position();
+ }
}
@@ -1052,14 +1138,14 @@ bool ReplicaProcess::persist(const Action& action)
}
-void ReplicaProcess::recover(const string& path)
+void ReplicaProcess::restore(const string& path)
{
- Try<State> state = storage->recover(path);
+ Try<State> state = storage->restore(path);
CHECK_SOME(state) << "Failed to recover the log";
// Pull out and save some of the state.
- proposal = state.get().proposal;
+ metadata = state.get().metadata;
begin = state.get().begin;
end = state.get().end;
unlearned = state.get().unlearned;
@@ -1134,12 +1220,28 @@ Future<uint64_t> Replica::ending() const
}
+Future<Metadata::Status> Replica::status() const
+{
+ return dispatch(process, &ReplicaProcess::status);
+}
+
+
Future<uint64_t> Replica::promised() const
{
return dispatch(process, &ReplicaProcess::promised);
}
+Future<bool> Replica::update(const Metadata::Status& status)
+{
+ // Need to disambiguate overloaded function.
+ bool (ReplicaProcess::*update)(const Metadata::Status& status) =
+ &ReplicaProcess::update;
+
+ return dispatch(process, update, status);
+}
+
+
PID<ReplicaProcess> Replica::pid() const
{
return process->self();
http://git-wip-us.apache.org/repos/asf/mesos/blob/f9b60c4c/src/log/replica.hpp
----------------------------------------------------------------------
diff --git a/src/log/replica.hpp b/src/log/replica.hpp
index 4cc7031..ecb126d 100644
--- a/src/log/replica.hpp
+++ b/src/log/replica.hpp
@@ -38,6 +38,7 @@ namespace protocol {
// Some replica protocol declarations.
extern Protocol<PromiseRequest, PromiseResponse> promise;
extern Protocol<WriteRequest, WriteResponse> write;
+extern Protocol<RecoverRequest, RecoverResponse> recover;
} // namespace protocol {
@@ -50,7 +51,11 @@ class Replica
{
public:
// Constructs a new replica process using specified path to a
- // directory for storing the underlying log.
+ // directory for storing the underlying log. If a replica starts
+ // with an empty log, it will not be allowed to vote (i.e., cannot
+ // reply to any request except the recover request). The recover
+ // process will later decide if this replica can be re-allowed to
+ // vote depending on the status of other replicas.
Replica(const std::string& path);
~Replica();
@@ -76,9 +81,15 @@ public:
// Returns the last written position in the log.
process::Future<uint64_t> ending() const;
+ // Returns the current status of this replica.
+ process::Future<Metadata::Status> status() const;
+
// Returns the highest implicit promise this replica has given.
process::Future<uint64_t> promised() const;
+ // Updates the status of this replica.
+ process::Future<bool> update(const Metadata::Status& status);
+
// Returns the PID associated with this replica.
process::PID<ReplicaProcess> pid() const;
http://git-wip-us.apache.org/repos/asf/mesos/blob/f9b60c4c/src/messages/log.proto
----------------------------------------------------------------------
diff --git a/src/messages/log.proto b/src/messages/log.proto
index 8fa46ca..d73b33f 100644
--- a/src/messages/log.proto
+++ b/src/messages/log.proto
@@ -79,8 +79,8 @@ message Metadata {
enum Status {
VOTING = 1; // Normal voting member in Paxos group.
RECOVERING = 2; // In the process of catching up.
- STARTING = 3; // Transient state between EMPTY and RECOVERING.
- EMPTY = 4; // Initial state if start with an empty log.
+ STARTING = 3; // The log has been initialized.
+ EMPTY = 4; // The log is empty and is not initialized.
}
required Status status = 1 [default = EMPTY];
@@ -179,3 +179,17 @@ message WriteResponse {
message LearnedMessage {
required Action action = 1;
}
+
+
+// Represents a recover request. A recover request is used to initiate
+// the recovery (by broadcasting it).
+message RecoverRequest {}
+
+
+// When a replica receives a RecoverRequest, it will reply with its
+// current status, and the begin and the end of its current log.
+message RecoverResponse {
+ required Metadata.Status status = 1;
+ optional uint64 begin = 2;
+ optional uint64 end = 3;
+}
http://git-wip-us.apache.org/repos/asf/mesos/blob/f9b60c4c/src/tests/log_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/log_tests.cpp b/src/tests/log_tests.cpp
index fb9bbd8..aeebfb7 100644
--- a/src/tests/log_tests.cpp
+++ b/src/tests/log_tests.cpp
@@ -18,6 +18,7 @@
#include <gmock/gmock.h>
+#include <list>
#include <set>
#include <string>
@@ -25,31 +26,32 @@
#include <process/future.hpp>
#include <process/gmock.hpp>
#include <process/gtest.hpp>
+#include <process/owned.hpp>
#include <process/pid.hpp>
#include <process/protobuf.hpp>
-#include <process/timeout.hpp>
+#include <process/shared.hpp>
#include <stout/gtest.hpp>
+#include <stout/none.hpp>
#include <stout/option.hpp>
#include <stout/os.hpp>
-
-#include "common/type_utils.hpp"
+#include <stout/path.hpp>
+#include <stout/try.hpp>
#include "log/coordinator.hpp"
#include "log/log.hpp"
+#include "log/network.hpp"
+#include "log/recover.hpp"
#include "log/replica.hpp"
-#include "messages/messages.hpp"
+#include "tests/utils.hpp"
using namespace mesos;
using namespace mesos::internal;
using namespace mesos::internal::log;
+using namespace mesos::internal::tests;
-using process::Clock;
-using process::Future;
-using process::Timeout;
-using process::Shared;
-using process::UPID;
+using namespace process;
using std::list;
using std::set;
@@ -59,9 +61,6 @@ using testing::_;
using testing::Eq;
using testing::Return;
-#include "tests/utils.hpp"
-
-using namespace mesos::internal::tests;
class ReplicaTest : public TemporaryDirectoryTest {};
@@ -175,7 +174,7 @@ TEST_F(ReplicaTest, Append)
}
-TEST_F(ReplicaTest, Recover)
+TEST_F(ReplicaTest, Restore)
{
const string path = os::getcwd() + "/.log";
@@ -278,9 +277,10 @@ TEST_F(CoordinatorTest, Elect)
Coordinator coord(2, replica1, network);
{
- Result<uint64_t> result = coord.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(0u, result.get());
+ Future<Option<uint64_t> > electing = coord.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(0u, electing.get().get());
}
{
@@ -311,18 +311,18 @@ TEST_F(CoordinatorTest, AppendRead)
Coordinator coord(2, replica1, network);
{
- Result<uint64_t> result = coord.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(0u, result.get());
+ Future<Option<uint64_t> > electing = coord.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(0u, electing.get().get());
}
uint64_t position;
{
- Result<uint64_t> result2 =
- coord.append("hello world", Timeout::in(Seconds(10)));
- ASSERT_SOME(result2);
- position = result2.get();
+ Future<uint64_t> appending = coord.append("hello world");
+ AWAIT_READY_FOR(appending, Seconds(10));
+ position = appending.get();
EXPECT_EQ(1u, position);
}
@@ -355,18 +355,18 @@ TEST_F(CoordinatorTest, AppendReadError)
Coordinator coord(2, replica1, network);
{
- Result<uint64_t> result = coord.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(0u, result.get());
+ Future<Option<uint64_t> > electing = coord.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(0u, electing.get().get());
}
uint64_t position;
{
- Result<uint64_t> result2 =
- coord.append("hello world", Timeout::in(Seconds(10)));
- ASSERT_SOME(result2);
- position = result2.get();
+ Future<uint64_t> appending = coord.append("hello world");
+ AWAIT_READY_FOR(appending, Seconds(10));
+ position = appending.get();
EXPECT_EQ(1u, position);
}
@@ -394,15 +394,12 @@ TEST_F(CoordinatorTest, ElectNoQuorum)
Clock::pause();
- // Create a timeout here so that we can advance time.
- Timeout timeout = Timeout::in(Seconds(10));
+ Future<Option<uint64_t> > electing = coord.elect();
Clock::advance(Seconds(10));
+ Clock::settle();
- {
- Result<uint64_t> result = coord.elect(timeout);
- EXPECT_TRUE(result.isNone());
- }
+ EXPECT_TRUE(electing.isPending());
Clock::resume();
}
@@ -425,9 +422,10 @@ TEST_F(CoordinatorTest, AppendNoQuorum)
Coordinator coord(2, replica1, network);
{
- Result<uint64_t> result = coord.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(0u, result.get());
+ Future<Option<uint64_t> > electing = coord.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(0u, electing.get().get());
}
process::terminate(replica2->pid());
@@ -436,15 +434,12 @@ TEST_F(CoordinatorTest, AppendNoQuorum)
Clock::pause();
- // Create a timeout here so that we can advance time.
- Timeout timeout = Timeout::in(Seconds(10));
+ Future<uint64_t> appending = coord.append("hello world");
Clock::advance(Seconds(10));
+ Clock::settle();
- {
- Result<uint64_t> result = coord.append("hello world", timeout);
- EXPECT_TRUE(result.isNone());
- }
+ EXPECT_TRUE(appending.isPending());
Clock::resume();
}
@@ -467,18 +462,18 @@ TEST_F(CoordinatorTest, Failover)
Coordinator coord1(2, replica1, network1);
{
- Result<uint64_t> result = coord1.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(0u, result.get());
+ Future<Option<uint64_t> > electing = coord1.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(0u, electing.get().get());
}
uint64_t position;
{
- Result<uint64_t> result =
- coord1.append("hello world", Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- position = result.get();
+ Future<uint64_t> appending = coord1.append("hello world");
+ AWAIT_READY_FOR(appending, Seconds(10));
+ position = appending.get();
EXPECT_EQ(1u, position);
}
@@ -487,9 +482,10 @@ TEST_F(CoordinatorTest, Failover)
Coordinator coord2(2, replica2, network2);
{
- Result<uint64_t> result = coord2.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(position, result.get());
+ Future<Option<uint64_t> > electing = coord2.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(position, electing.get().get());
}
{
@@ -521,18 +517,18 @@ TEST_F(CoordinatorTest, Demoted)
Coordinator coord1(2, replica1, network1);
{
- Result<uint64_t> result = coord1.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(0u, result.get());
+ Future<Option<uint64_t> > electing = coord1.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(0u, electing.get().get());
}
uint64_t position;
{
- Result<uint64_t> result =
- coord1.append("hello world", Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- position = result.get();
+ Future<uint64_t> appending = coord1.append("hello world");
+ AWAIT_READY_FOR(appending, Seconds(10));
+ position = appending.get();
EXPECT_EQ(1u, position);
}
@@ -541,23 +537,22 @@ TEST_F(CoordinatorTest, Demoted)
Coordinator coord2(2, replica2, network2);
{
- Result<uint64_t> result = coord2.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(position, result.get());
+ Future<Option<uint64_t> > electing = coord2.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(position, electing.get().get());
}
{
- Result<uint64_t> result =
- coord1.append("hello moto", Timeout::in(Seconds(10)));
- ASSERT_TRUE(result.isError());
- EXPECT_EQ("Coordinator demoted", result.error());
+ Future<uint64_t> appending = coord1.append("hello moto");
+ AWAIT_FAILED(appending);
+ EXPECT_EQ("Coordinator demoted", appending.failure());
}
{
- Result<uint64_t> result =
- coord2.append("hello hello", Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- position = result.get();
+ Future<uint64_t> appending = coord2.append("hello hello");
+ AWAIT_READY_FOR(appending, Seconds(10));
+ position = appending.get();
EXPECT_EQ(2u, position);
}
@@ -591,18 +586,18 @@ TEST_F(CoordinatorTest, Fill)
Coordinator coord1(2, replica1, network1);
{
- Result<uint64_t> result = coord1.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(0u, result.get());
+ Future<Option<uint64_t> > electing = coord1.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(0u, electing.get().get());
}
uint64_t position;
{
- Result<uint64_t> result =
- coord1.append("hello world", Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- position = result.get();
+ Future<uint64_t> appending = coord1.append("hello world");
+ AWAIT_READY_FOR(appending, Seconds(10));
+ position = appending.get();
EXPECT_EQ(1u, position);
}
@@ -617,11 +612,14 @@ TEST_F(CoordinatorTest, Fill)
Coordinator coord2(2, replica3, network2);
{
- Result<uint64_t> result = coord2.elect(Timeout::in(Seconds(10)));
- ASSERT_TRUE(result.isNone());
- result = coord2.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(position, result.get());
+ Future<Option<uint64_t> > electing = coord2.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_NONE(electing.get());
+
+ electing = coord2.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(position, electing.get().get());
}
{
@@ -658,18 +656,18 @@ TEST_F(CoordinatorTest, NotLearnedFill)
Coordinator coord1(2, replica1, network1);
{
- Result<uint64_t> result = coord1.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(0u, result.get());
+ Future<Option<uint64_t> > electing = coord1.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(0u, electing.get().get());
}
uint64_t position;
{
- Result<uint64_t> result =
- coord1.append("hello world", Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- position = result.get();
+ Future<uint64_t> appending = coord1.append("hello world");
+ AWAIT_READY_FOR(appending, Seconds(10));
+ position = appending.get();
EXPECT_EQ(1u, position);
}
@@ -684,11 +682,14 @@ TEST_F(CoordinatorTest, NotLearnedFill)
Coordinator coord2(2, replica3, network2);
{
- Result<uint64_t> result = coord2.elect(Timeout::in(Seconds(10)));
- ASSERT_TRUE(result.isNone());
- result = coord2.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(position, result.get());
+ Future<Option<uint64_t> > electing = coord2.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_NONE(electing.get());
+
+ electing = coord2.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(position, electing.get().get());
}
{
@@ -720,16 +721,16 @@ TEST_F(CoordinatorTest, MultipleAppends)
Coordinator coord(2, replica1, network);
{
- Result<uint64_t> result = coord.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(0u, result.get());
+ Future<Option<uint64_t> > electing = coord.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(0u, electing.get().get());
}
for (uint64_t position = 1; position <= 10; position++) {
- Result<uint64_t> result =
- coord.append(stringify(position), Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(position, result.get());
+ Future<uint64_t> appending = coord.append(stringify(position));
+ AWAIT_READY_FOR(appending, Seconds(10));
+ EXPECT_EQ(position, appending.get());
}
{
@@ -767,16 +768,16 @@ TEST_F(CoordinatorTest, MultipleAppendsNotLearnedFill)
Coordinator coord1(2, replica1, network1);
{
- Result<uint64_t> result = coord1.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(0u, result.get());
+ Future<Option<uint64_t> > electing = coord1.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(0u, electing.get().get());
}
for (uint64_t position = 1; position <= 10; position++) {
- Result<uint64_t> result =
- coord1.append(stringify(position), Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(position, result.get());
+ Future<uint64_t> appending = coord1.append(stringify(position));
+ AWAIT_READY_FOR(appending, Seconds(10));
+ EXPECT_EQ(position, appending.get());
}
Shared<Replica> replica3(new Replica(path3));
@@ -790,11 +791,14 @@ TEST_F(CoordinatorTest, MultipleAppendsNotLearnedFill)
Coordinator coord2(2, replica3, network2);
{
- Result<uint64_t> result = coord2.elect(Timeout::in(Seconds(10)));
- ASSERT_TRUE(result.isNone());
- result = coord2.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(10u, result.get());
+ Future<Option<uint64_t> > electing = coord2.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_NONE(electing.get());
+
+ electing = coord2.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(10u, electing.get().get());
}
{
@@ -827,22 +831,22 @@ TEST_F(CoordinatorTest, Truncate)
Coordinator coord(2, replica1, network);
{
- Result<uint64_t> result = coord.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(0u, result.get());
+ Future<Option<uint64_t> > electing = coord.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(0u, electing.get().get());
}
for (uint64_t position = 1; position <= 10; position++) {
- Result<uint64_t> result =
- coord.append(stringify(position), Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(position, result.get());
+ Future<uint64_t> appending = coord.append(stringify(position));
+ AWAIT_READY_FOR(appending, Seconds(10));
+ EXPECT_EQ(position, appending.get());
}
{
- Result<uint64_t> result = coord.truncate(7, Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(11u, result.get());
+ Future<uint64_t> truncating = coord.truncate(7);
+ AWAIT_READY_FOR(truncating, Seconds(10));
+ EXPECT_EQ(11u, truncating.get());
}
{
@@ -886,22 +890,22 @@ TEST_F(CoordinatorTest, TruncateNotLearnedFill)
Coordinator coord1(2, replica1, network1);
{
- Result<uint64_t> result = coord1.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(0u, result.get());
+ Future<Option<uint64_t> > electing = coord1.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(0u, electing.get().get());
}
for (uint64_t position = 1; position <= 10; position++) {
- Result<uint64_t> result =
- coord1.append(stringify(position), Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(position, result.get());
+ Future<uint64_t> appending = coord1.append(stringify(position));
+ AWAIT_READY_FOR(appending, Seconds(10));
+ EXPECT_EQ(position, appending.get());
}
{
- Result<uint64_t> result = coord1.truncate(7, Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(11u, result.get());
+ Future<uint64_t> truncating = coord1.truncate(7);
+ AWAIT_READY_FOR(truncating, Seconds(10));
+ EXPECT_EQ(11u, truncating.get());
}
Shared<Replica> replica3(new Replica(path3));
@@ -915,11 +919,14 @@ TEST_F(CoordinatorTest, TruncateNotLearnedFill)
Coordinator coord2(2, replica3, network2);
{
- Result<uint64_t> result = coord2.elect(Timeout::in(Seconds(10)));
- ASSERT_TRUE(result.isNone());
- result = coord2.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(11u, result.get());
+ Future<Option<uint64_t> > electing = coord2.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_NONE(electing.get());
+
+ electing = coord2.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(11u, electing.get().get());
}
{
@@ -959,22 +966,22 @@ TEST_F(CoordinatorTest, TruncateLearnedFill)
Coordinator coord1(2, replica1, network1);
{
- Result<uint64_t> result = coord1.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(0u, result.get());
+ Future<Option<uint64_t> > electing = coord1.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(0u, electing.get().get());
}
for (uint64_t position = 1; position <= 10; position++) {
- Result<uint64_t> result =
- coord1.append(stringify(position), Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(position, result.get());
+ Future<uint64_t> appending = coord1.append(stringify(position));
+ AWAIT_READY_FOR(appending, Seconds(10));
+ EXPECT_EQ(position, appending.get());
}
{
- Result<uint64_t> result = coord1.truncate(7, Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(11u, result.get());
+ Future<uint64_t> truncating = coord1.truncate(7);
+ AWAIT_READY_FOR(truncating, Seconds(10));
+ EXPECT_EQ(11u, truncating.get());
}
Shared<Replica> replica3(new Replica(path3));
@@ -988,11 +995,14 @@ TEST_F(CoordinatorTest, TruncateLearnedFill)
Coordinator coord2(2, replica3, network2);
{
- Result<uint64_t> result = coord2.elect(Timeout::in(Seconds(10)));
- ASSERT_TRUE(result.isNone());
- result = coord2.elect(Timeout::in(Seconds(10)));
- ASSERT_SOME(result);
- EXPECT_EQ(11u, result.get());
+ Future<Option<uint64_t> > electing = coord2.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_NONE(electing.get());
+
+ electing = coord2.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(11u, electing.get().get());
}
{
@@ -1014,6 +1024,103 @@ TEST_F(CoordinatorTest, TruncateLearnedFill)
}
+class RecoverTest : public TemporaryDirectoryTest {};
+
+
+// Two replicas that both need recovery race each other while catching up.
+TEST_F(RecoverTest, RacingCatchup)
+{
+ const string path1 = os::getcwd() + "/.log1";
+ const string path2 = os::getcwd() + "/.log2";
+ const string path3 = os::getcwd() + "/.log3";
+ const string path4 = os::getcwd() + "/.log4";
+ const string path5 = os::getcwd() + "/.log5";
+
+ Shared<Replica> replica1(new Replica(path1));
+ Shared<Replica> replica2(new Replica(path2));
+ Shared<Replica> replica3(new Replica(path3));
+
+ set<UPID> pids;
+ pids.insert(replica1->pid());
+ pids.insert(replica2->pid());
+ pids.insert(replica3->pid());
+
+ Shared<Network> network1(new Network(pids));
+
+ Coordinator coord1(3, replica1, network1);
+
+ {
+ Future<Option<uint64_t> > electing = coord1.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(0u, electing.get().get());
+ }
+
+ for (uint64_t position = 1; position <= 10; position++) {
+ Future<uint64_t> appending = coord1.append(stringify(position));
+ AWAIT_READY_FOR(appending, Seconds(10));
+ EXPECT_EQ(position, appending.get());
+ }
+
+ // Two more replicas join the network and both start recovering at once.
+ Owned<Replica> replica4(new Replica(path4));
+ Owned<Replica> replica5(new Replica(path5));
+
+ pids.insert(replica4->pid());
+ pids.insert(replica5->pid());
+
+ Shared<Network> network2(new Network(pids));
+
+ Future<Owned<Replica> > recovering4 = recover(3, replica4, network2);
+ Future<Owned<Replica> > recovering5 = recover(3, replica5, network2);
+
+ // Wait until both recoveries are done.
+ AWAIT_READY(recovering4);
+ AWAIT_READY(recovering5);
+
+ Shared<Replica> shared4 = recovering4.get().share();
+ Coordinator coord2(3, shared4, network2); // Majority of the 5 replicas.
+
+ {
+ Future<Option<uint64_t> > electing = coord2.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_NONE(electing.get()); // First attempt yields None; retry below.
+
+ electing = coord2.elect();
+ AWAIT_READY_FOR(electing, Seconds(10));
+ ASSERT_SOME(electing.get());
+ EXPECT_EQ(10u, electing.get().get());
+ }
+
+ {
+ Future<list<Action> > actions = shared4->read(1, 10);
+ AWAIT_READY(actions);
+ EXPECT_EQ(10u, actions.get().size());
+ foreach (const Action& action, actions.get()) {
+ ASSERT_TRUE(action.has_type());
+ ASSERT_EQ(Action::APPEND, action.type());
+ EXPECT_EQ(stringify(action.position()), action.append().bytes());
+ }
+ }
+
+ {
+ Future<uint64_t> appending = coord2.append("hello hello");
+ AWAIT_READY_FOR(appending, Seconds(10));
+ EXPECT_EQ(11u, appending.get());
+ }
+
+ {
+ Future<list<Action> > actions = shared4->read(11u, 11u);
+ AWAIT_READY(actions);
+ ASSERT_EQ(1u, actions.get().size());
+ EXPECT_EQ(11u, actions.get().front().position());
+ ASSERT_TRUE(actions.get().front().has_type());
+ ASSERT_EQ(Action::APPEND, actions.get().front().type());
+ EXPECT_EQ("hello hello", actions.get().front().append().bytes());
+ }
+}
+
+
class LogTest : public TemporaryDirectoryTest {};