You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bm...@apache.org on 2013/11/26 01:00:15 UTC
[1/2] git commit: Drop messages from non-leading Masters in the
scheduler driver.
Updated Branches:
refs/heads/master f051c4535 -> 718eb711c
Drop messages from non-leading Masters in the scheduler driver.
Review: https://reviews.apache.org/r/15778
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/57c2b728
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/57c2b728
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/57c2b728
Branch: refs/heads/master
Commit: 57c2b7288590b7e8a454658f9a1bd208acf0a62c
Parents: f051c45
Author: Benjamin Mahler <bm...@twitter.com>
Authored: Fri Nov 22 00:20:57 2013 -0800
Committer: Benjamin Mahler <bm...@twitter.com>
Committed: Mon Nov 25 14:56:35 2013 -0800
----------------------------------------------------------------------
src/sched/sched.cpp | 107 +++++++++++++++++++++++++++----
src/tests/fault_tolerance_tests.cpp | 89 +++++++++++++++++++++++++
2 files changed, 183 insertions(+), 13 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/57c2b728/src/sched/sched.cpp
----------------------------------------------------------------------
diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp
index 51f95bb..af57f28 100644
--- a/src/sched/sched.cpp
+++ b/src/sched/sched.cpp
@@ -356,7 +356,10 @@ protected:
}
}
- void registered(const FrameworkID& frameworkId, const MasterInfo& masterInfo)
+ void registered(
+ const UPID& from,
+ const FrameworkID& frameworkId,
+ const MasterInfo& masterInfo)
{
if (aborted) {
VLOG(1) << "Ignoring framework registered message because "
@@ -370,6 +373,13 @@ protected:
return;
}
+ if (!master.isSome() || from != master.get()) {
+ VLOG(1) << "Ignoring framework registered message because it was sent "
+ << "from '" << from << "' instead of the leading master '"
+ << (master.isSome() ? master.get() : UPID()) << "'";
+ return;
+ }
+
VLOG(1) << "Framework registered with " << frameworkId;
framework.mutable_id()->MergeFrom(frameworkId);
@@ -387,7 +397,10 @@ protected:
VLOG(1) << "Scheduler::registered took " << stopwatch.elapsed();
}
- void reregistered(const FrameworkID& frameworkId, const MasterInfo& masterInfo)
+ void reregistered(
+ const UPID& from,
+ const FrameworkID& frameworkId,
+ const MasterInfo& masterInfo)
{
if (aborted) {
VLOG(1) << "Ignoring framework re-registered message because "
@@ -401,6 +414,13 @@ protected:
return;
}
+ if (!master.isSome() || from != master.get()) {
+ VLOG(1) << "Ignoring framework re-registered message because it was sent "
+ << "from '" << from << "' instead of the leading master '"
+ << (master.isSome() ? master.get() : UPID()) << "'";
+ return;
+ }
+
VLOG(1) << "Framework re-registered with " << frameworkId;
CHECK(framework.id() == frameworkId);
@@ -444,8 +464,10 @@ protected:
delay(Seconds(1), self(), &Self::doReliableRegistration);
}
- void resourceOffers(const vector<Offer>& offers,
- const vector<string>& pids)
+ void resourceOffers(
+ const UPID& from,
+ const vector<Offer>& offers,
+ const vector<string>& pids)
{
if (aborted) {
VLOG(1) << "Ignoring resource offers message because "
@@ -453,6 +475,21 @@ protected:
return;
}
+ if (!connected) {
+ VLOG(1) << "Ignoring resource offers message because the driver is "
+ << "disconnected!";
+ return;
+ }
+
+ CHECK_SOME(master);
+
+ if (from != master.get()) {
+ VLOG(1) << "Ignoring resource offers message because it was sent "
+ << "from '" << from << "' instead of the leading master '"
+ << master.get() << "'";
+ return;
+ }
+
VLOG(2) << "Received " << offers.size() << " offers";
CHECK(offers.size() == pids.size());
@@ -480,7 +517,7 @@ protected:
VLOG(1) << "Scheduler::resourceOffers took " << stopwatch.elapsed();
}
- void rescindOffer(const OfferID& offerId)
+ void rescindOffer(const UPID& from, const OfferID& offerId)
{
if (aborted) {
VLOG(1) << "Ignoring rescind offer message because "
@@ -488,6 +525,21 @@ protected:
return;
}
+ if (!connected) {
+ VLOG(1) << "Ignoring rescind offer message because the driver is "
+ << "disconnected!";
+ return;
+ }
+
+ CHECK_SOME(master);
+
+ if (from != master.get()) {
+ VLOG(1) << "Ignoring rescind offer message because it was sent "
+ << "from '" << from << "' instead of the leading master '"
+ << master.get() << "'";
+ return;
+ }
+
VLOG(1) << "Rescinded offer " << offerId;
savedOffers.erase(offerId);
@@ -502,7 +554,10 @@ protected:
VLOG(1) << "Scheduler::offerRescinded took " << stopwatch.elapsed();
}
- void statusUpdate(const StatusUpdate& update, const UPID& pid)
+ void statusUpdate(
+ const UPID& from,
+ const StatusUpdate& update,
+ const UPID& pid)
{
const TaskStatus& status = update.status();
@@ -512,6 +567,24 @@ protected:
return;
}
+ // Allow status updates created from the driver itself.
+ if (from != UPID()) {
+ if (!connected) {
+ VLOG(1) << "Ignoring status update message because the driver is "
+ << "disconnected!";
+ return;
+ }
+
+ CHECK_SOME(master);
+
+ if (from != master.get()) {
+ VLOG(1) << "Ignoring status update message because it was sent "
+ << "from '" << from << "' instead of the leading master '"
+ << master.get() << "'";
+ return;
+ }
+ }
+
VLOG(2) << "Received status update " << update << " from " << pid;
CHECK(framework.id() == update.framework_id());
@@ -569,10 +642,18 @@ protected:
return;
}
- if (!master.isSome() || from != master.get()) {
- LOG(WARNING) << "Ignoring lost slave message from " << from
- << " because it is not from the registered master ("
- << (master.isSome() ? master.get() : "NONE/ERROR") << ")";
+ if (!connected) {
+ VLOG(1) << "Ignoring lost slave message because the driver is "
+ << "disconnected!";
+ return;
+ }
+
+ CHECK_SOME(master);
+
+ if (from != master.get()) {
+ VLOG(1) << "Ignoring lost slave message because it was sent "
+ << "from '" << from << "' instead of the leading master '"
+ << master.get() << "'";
return;
}
@@ -732,7 +813,7 @@ protected:
update.set_timestamp(Clock::now().secs());
update.set_uuid(UUID::random().toBytes());
- statusUpdate(update, UPID());
+ statusUpdate(UPID(), update, UPID());
}
return;
}
@@ -753,7 +834,7 @@ protected:
update.set_timestamp(Clock::now().secs());
update.set_uuid(UUID::random().toBytes());
- statusUpdate(update, UPID());
+ statusUpdate(UPID(), update, UPID());
continue;
}
@@ -773,7 +854,7 @@ protected:
update.set_timestamp(Clock::now().secs());
update.set_uuid(UUID::random().toBytes());
- statusUpdate(update, UPID());
+ statusUpdate(UPID(), update, UPID());
continue;
}
http://git-wip-us.apache.org/repos/asf/mesos/blob/57c2b728/src/tests/fault_tolerance_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/fault_tolerance_tests.cpp b/src/tests/fault_tolerance_tests.cpp
index 6cb5829..6fb140e 100644
--- a/src/tests/fault_tolerance_tests.cpp
+++ b/src/tests/fault_tolerance_tests.cpp
@@ -1768,3 +1768,92 @@ TEST_F(FaultToleranceTest, ReconcileIncompleteTasks)
Shutdown();
}
+
+
+// This test ensures that if a master incorrectly thinks that it is
+// leading, the scheduler driver will drop messages from this master.
+// Unfortunately, it is not currently possible to start more than one
+// master within the same process. So, this test merely simulates this
+// by spoofing messages.
+// This test does the following:
+// 1. Start a master, scheduler, and launch a task.
+// 2. Spoof a lost task message for the slave.
+// 3. Once the message is sent to the scheduler, kill the task.
+// 4. Ensure the task was KILLED rather than LOST.
+TEST_F(FaultToleranceTest, SplitBrainMasters)
+{
+ // 1. Start a master, scheduler, and launch a task.
+ Try<PID<Master> > master = StartMaster();
+ ASSERT_SOME(master);
+
+ MockExecutor exec(DEFAULT_EXECUTOR_ID);
+
+ Try<PID<Slave> > slave = StartSlave(&exec);
+ ASSERT_SOME(slave);
+
+ MockScheduler sched;
+ MesosSchedulerDriver driver(
+ &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);
+
+ Future<Message> registered =
+ FUTURE_MESSAGE(Eq(FrameworkRegisteredMessage().GetTypeName()), _, _);
+
+ Future<FrameworkID> frameworkId;
+ EXPECT_CALL(sched, registered(&driver, _, _))
+ .WillOnce(FutureArg<1>(&frameworkId));
+
+ EXPECT_CALL(sched, resourceOffers(&driver, _))
+ .WillOnce(LaunchTasks(DEFAULT_EXECUTOR_INFO, 1, 1, 512, "*"))
+ .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+ EXPECT_CALL(exec, registered(_, _, _, _));
+
+ EXPECT_CALL(exec, launchTask(_, _))
+ .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));
+
+ Future<TaskStatus> runningStatus;
+ EXPECT_CALL(sched, statusUpdate(&driver, _))
+ .WillOnce(FutureArg<1>(&runningStatus));
+
+ driver.start();
+
+ AWAIT_READY(registered);
+ AWAIT_READY(frameworkId);
+ AWAIT_READY(runningStatus);
+ EXPECT_EQ(TASK_RUNNING, runningStatus.get().state());
+
+ // 2. Spoof a lost task message for the slave.
+ StatusUpdateMessage lostUpdate;
+ lostUpdate.mutable_update()->CopyFrom(createStatusUpdate(
+ frameworkId.get(),
+ runningStatus.get().slave_id(),
+ runningStatus.get().task_id(),
+ TASK_LOST));
+
+ // Spoof a message from a random master; this should be dropped by
+ // the scheduler driver. Since this is delivered locally, it is
+ // synchronously placed on the scheduler driver's queue.
+ process::post(UPID("master2@127.0.0.1:50"), registered.get().to, lostUpdate);
+
+ // 3. Once the message is sent to the scheduler, kill the task.
+ EXPECT_CALL(exec, killTask(_, _))
+ .WillOnce(SendStatusUpdateFromTaskID(TASK_KILLED));
+
+ Future<TaskStatus> killedStatus;
+ EXPECT_CALL(sched, statusUpdate(&driver, _))
+ .WillOnce(FutureArg<1>(&killedStatus));
+
+ driver.killTask(runningStatus.get().task_id());
+
+ // 4. Ensure the task was KILLED rather than LOST.
+ AWAIT_READY(killedStatus);
+ EXPECT_EQ(TASK_KILLED, killedStatus.get().state());
+
+ EXPECT_CALL(exec, shutdown(_))
+ .WillRepeatedly(Return());
+
+ driver.stop();
+ driver.join();
+
+ Shutdown();
+}
[2/2] git commit: Added build instructions for Mac OS X Mavericks.
Posted by bm...@apache.org.
Added build instructions for Mac OS X Mavericks.
From: Niklas Nielsen <ni...@qni.dk>
Review: https://reviews.apache.org/r/15727
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/718eb711
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/718eb711
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/718eb711
Branch: refs/heads/master
Commit: 718eb711cb8a2984536e3fee9e6ade53c9977bfc
Parents: 57c2b72
Author: Benjamin Mahler <bm...@twitter.com>
Authored: Mon Nov 25 15:57:16 2013 -0800
Committer: Benjamin Mahler <bm...@twitter.com>
Committed: Mon Nov 25 15:59:56 2013 -0800
----------------------------------------------------------------------
docs/getting-started.md | 44 +++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 43 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/718eb711/docs/getting-started.md
----------------------------------------------------------------------
diff --git a/docs/getting-started.md b/docs/getting-started.md
index 3986dab..3ab2d0f 100644
--- a/docs/getting-started.md
+++ b/docs/getting-started.md
@@ -56,6 +56,12 @@ If you are building from git repository, you will need to additionally install t
> 2. Mesos is currently being developed/tested/supported on 64 Bit machines only.
+> 3. Currently, Mesos does not build with the Clang compiler. Up to and including Mac OS X Mountain Lion, `gcc` and `g++` were GNU GCC compiler frontends for LLVM.
+This changed with Mac OS X Mavericks: `gcc` and `g++` are now Clang wrappers which compile against a new standard C library.
+This unfortunately breaks our current build process, so please follow the separate instructions below to get Mesos building on Mavericks.
+
+> 4. Mesos does not build with automake 1.14. Please make sure an older automake, for example version 1.12, is installed.
+
## Building Mesos
@@ -77,6 +83,42 @@ If you are building from git repository, you will need to additionally install t
# Install (***Optional***).
$ make install
+### Building Mesos on Mac OS X Mavericks
+
+ # Make sure you have the right versions of automake and GCC.
+ # For example:
+ $ automake --version
+ automake (GNU automake) 1.12.6
+ ...
+
+ $ aclocal --version
+ aclocal (GNU automake) 1.12.6
+ ...
+
+ $ gcc-4.8 --version
+ gcc-4.8 (GCC) 4.8.1
+ ...
+
+ # Follow bootstrap steps above.
+
+ # Configure and build.
+ $ mkdir build
+ $ cd build
+
+ # Prepending CC and CXX selects the custom compilers to build Mesos with.
+ $ CC=gcc-4.8 CXX=g++-4.8 ../configure
+
+ # Follow make steps above.
+
+***NOTES***
+
+> 1. Link errors might occur if clang versions of `gcc` and `g++` were used to compile some of the object files.
+This happens if configure was run without `CC` and `CXX` set and `make` subsequently failed.
+Make sure the build directory is completely empty before running configure.
+> 2. A Python framework test failure is expected on Mavericks.
+This is due to the system Python binary usually being compiled against the new (non-GNU) standard C library.
+Either use a GCC-built Python and select it with `PYTHON=<location> CC=gcc-4.8 CXX=g++-4.8 ../configure` or disable Python support with `CC=gcc-4.8 CXX=g++-4.8 ../configure --disable-python`.
+
## Examples
Mesos comes bundled with example frameworks written in `C++`, `Java` and `Python`.
@@ -101,4 +143,4 @@ Mesos comes bundled with example frameworks written in `C++`, `Java` and `Python
# Run Python framework (***Exits after successfully running some tasks.***).
$ ./src/examples/python/test-framework 127.0.0.1:5050
-*NOTE: To build the example frameworks, make sure you build the test suite by doing `make check`.*
\ No newline at end of file
+*NOTE: To build the example frameworks, make sure you build the test suite by doing `make check`.*