You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bm...@apache.org on 2013/11/26 01:00:15 UTC

[1/2] git commit: Drop messages from non-leading Masters in the scheduler driver.

Updated Branches:
  refs/heads/master f051c4535 -> 718eb711c


Drop messages from non-leading Masters in the scheduler driver.

Review: https://reviews.apache.org/r/15778


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/57c2b728
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/57c2b728
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/57c2b728

Branch: refs/heads/master
Commit: 57c2b7288590b7e8a454658f9a1bd208acf0a62c
Parents: f051c45
Author: Benjamin Mahler <bm...@twitter.com>
Authored: Fri Nov 22 00:20:57 2013 -0800
Committer: Benjamin Mahler <bm...@twitter.com>
Committed: Mon Nov 25 14:56:35 2013 -0800

----------------------------------------------------------------------
 src/sched/sched.cpp                 | 107 +++++++++++++++++++++++++++----
 src/tests/fault_tolerance_tests.cpp |  89 +++++++++++++++++++++++++
 2 files changed, 183 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/57c2b728/src/sched/sched.cpp
----------------------------------------------------------------------
diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp
index 51f95bb..af57f28 100644
--- a/src/sched/sched.cpp
+++ b/src/sched/sched.cpp
@@ -356,7 +356,10 @@ protected:
     }
   }
 
-  void registered(const FrameworkID& frameworkId, const MasterInfo& masterInfo)
+  void registered(
+      const UPID& from,
+      const FrameworkID& frameworkId,
+      const MasterInfo& masterInfo)
   {
     if (aborted) {
       VLOG(1) << "Ignoring framework registered message because "
@@ -370,6 +373,13 @@ protected:
       return;
     }
 
+    if (!master.isSome() || from != master.get()) {
+      VLOG(1) << "Ignoring framework registered message because it was sent "
+              << "from '" << from << "' instead of the leading master '"
+              << (master.isSome() ? master.get() : UPID()) << "'";
+      return;
+    }
+
     VLOG(1) << "Framework registered with " << frameworkId;
 
     framework.mutable_id()->MergeFrom(frameworkId);
@@ -387,7 +397,10 @@ protected:
     VLOG(1) << "Scheduler::registered took " << stopwatch.elapsed();
   }
 
-  void reregistered(const FrameworkID& frameworkId, const MasterInfo& masterInfo)
+  void reregistered(
+      const UPID& from,
+      const FrameworkID& frameworkId,
+      const MasterInfo& masterInfo)
   {
     if (aborted) {
       VLOG(1) << "Ignoring framework re-registered message because "
@@ -401,6 +414,13 @@ protected:
       return;
     }
 
+    if (!master.isSome() || from != master.get()) {
+      VLOG(1) << "Ignoring framework re-registered message because it was sent "
+              << "from '" << from << "' instead of the leading master '"
+              << (master.isSome() ? master.get() : UPID()) << "'";
+      return;
+    }
+
     VLOG(1) << "Framework re-registered with " << frameworkId;
 
     CHECK(framework.id() == frameworkId);
@@ -444,8 +464,10 @@ protected:
     delay(Seconds(1), self(), &Self::doReliableRegistration);
   }
 
-  void resourceOffers(const vector<Offer>& offers,
-                      const vector<string>& pids)
+  void resourceOffers(
+      const UPID& from,
+      const vector<Offer>& offers,
+      const vector<string>& pids)
   {
     if (aborted) {
       VLOG(1) << "Ignoring resource offers message because "
@@ -453,6 +475,21 @@ protected:
       return;
     }
 
+    if (!connected) {
+      VLOG(1) << "Ignoring resource offers message because the driver is "
+              << "disconnected!";
+      return;
+    }
+
+    CHECK_SOME(master);
+
+    if (from != master.get()) {
+      VLOG(1) << "Ignoring resource offers message because it was sent "
+              << "from '" << from << "' instead of the leading master '"
+              << master.get() << "'";
+      return;
+    }
+
     VLOG(2) << "Received " << offers.size() << " offers";
 
     CHECK(offers.size() == pids.size());
@@ -480,7 +517,7 @@ protected:
     VLOG(1) << "Scheduler::resourceOffers took " << stopwatch.elapsed();
   }
 
-  void rescindOffer(const OfferID& offerId)
+  void rescindOffer(const UPID& from, const OfferID& offerId)
   {
     if (aborted) {
       VLOG(1) << "Ignoring rescind offer message because "
@@ -488,6 +525,21 @@ protected:
       return;
     }
 
+    if (!connected) {
+      VLOG(1) << "Ignoring rescind offer message because the driver is "
+              << "disconnected!";
+      return;
+    }
+
+    CHECK_SOME(master);
+
+    if (from != master.get()) {
+      VLOG(1) << "Ignoring rescind offer message because it was sent "
+              << "from '" << from << "' instead of the leading master '"
+              << master.get() << "'";
+      return;
+    }
+
     VLOG(1) << "Rescinded offer " << offerId;
 
     savedOffers.erase(offerId);
@@ -502,7 +554,10 @@ protected:
     VLOG(1) << "Scheduler::offerRescinded took " << stopwatch.elapsed();
   }
 
-  void statusUpdate(const StatusUpdate& update, const UPID& pid)
+  void statusUpdate(
+      const UPID& from,
+      const StatusUpdate& update,
+      const UPID& pid)
   {
     const TaskStatus& status = update.status();
 
@@ -512,6 +567,24 @@ protected:
       return;
     }
 
+    // Allow status updates created from the driver itself.
+    if (from != UPID()) {
+      if (!connected) {
+        VLOG(1) << "Ignoring status update message because the driver is "
+                << "disconnected!";
+        return;
+      }
+
+      CHECK_SOME(master);
+
+      if (from != master.get()) {
+        VLOG(1) << "Ignoring status update message because it was sent "
+                << "from '" << from << "' instead of the leading master '"
+                << master.get() << "'";
+        return;
+      }
+    }
+
     VLOG(2) << "Received status update " << update << " from " << pid;
 
     CHECK(framework.id() == update.framework_id());
@@ -569,10 +642,18 @@ protected:
       return;
     }
 
-    if (!master.isSome() || from != master.get()) {
-      LOG(WARNING) << "Ignoring lost slave message from " << from
-                   << " because it is not from the registered master ("
-                   << (master.isSome() ? master.get() : "NONE/ERROR") << ")";
+    if (!connected) {
+      VLOG(1) << "Ignoring lost slave message because the driver is "
+              << "disconnected!";
+      return;
+    }
+
+    CHECK_SOME(master);
+
+    if (from != master.get()) {
+      VLOG(1) << "Ignoring lost slave message because it was sent "
+              << "from '" << from << "' instead of the leading master '"
+              << master.get() << "'";
       return;
     }
 
@@ -732,7 +813,7 @@ protected:
         update.set_timestamp(Clock::now().secs());
         update.set_uuid(UUID::random().toBytes());
 
-        statusUpdate(update, UPID());
+        statusUpdate(UPID(), update, UPID());
       }
       return;
     }
@@ -753,7 +834,7 @@ protected:
         update.set_timestamp(Clock::now().secs());
         update.set_uuid(UUID::random().toBytes());
 
-        statusUpdate(update, UPID());
+        statusUpdate(UPID(), update, UPID());
         continue;
       }
 
@@ -773,7 +854,7 @@ protected:
         update.set_timestamp(Clock::now().secs());
         update.set_uuid(UUID::random().toBytes());
 
-        statusUpdate(update, UPID());
+        statusUpdate(UPID(), update, UPID());
         continue;
       }
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/57c2b728/src/tests/fault_tolerance_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/fault_tolerance_tests.cpp b/src/tests/fault_tolerance_tests.cpp
index 6cb5829..6fb140e 100644
--- a/src/tests/fault_tolerance_tests.cpp
+++ b/src/tests/fault_tolerance_tests.cpp
@@ -1768,3 +1768,92 @@ TEST_F(FaultToleranceTest, ReconcileIncompleteTasks)
 
   Shutdown();
 }
+
+
+// This test ensures that if a master incorrectly thinks that it is
+// leading, the scheduler driver will drop messages from this master.
+// Unfortunately, it is not currently possible to start more than one
+// master within the same process. So, this test merely simulates this
+// by spoofing messages.
+// This test does the following:
+//   1. Start a master, scheduler, and launch a task.
+//   2. Spoof a lost task message for the slave.
+//   3. Once the message is sent to the scheduler, kill the task.
+//   4. Ensure the task was KILLED rather than LOST.
+TEST_F(FaultToleranceTest, SplitBrainMasters)
+{
+  // 1. Start a master, scheduler, and launch a task.
+  Try<PID<Master> > master = StartMaster();
+  ASSERT_SOME(master);
+
+  MockExecutor exec(DEFAULT_EXECUTOR_ID);
+
+  Try<PID<Slave> > slave = StartSlave(&exec);
+  ASSERT_SOME(slave);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);
+
+  Future<Message> registered =
+    FUTURE_MESSAGE(Eq(FrameworkRegisteredMessage().GetTypeName()), _, _);
+
+  Future<FrameworkID> frameworkId;
+  EXPECT_CALL(sched, registered(&driver, _, _))
+    .WillOnce(FutureArg<1>(&frameworkId));
+
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(LaunchTasks(DEFAULT_EXECUTOR_INFO, 1, 1, 512, "*"))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  EXPECT_CALL(exec, registered(_, _, _, _));
+
+  EXPECT_CALL(exec, launchTask(_, _))
+    .WillOnce(SendStatusUpdateFromTask(TASK_RUNNING));
+
+  Future<TaskStatus> runningStatus;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&runningStatus));
+
+  driver.start();
+
+  AWAIT_READY(registered);
+  AWAIT_READY(frameworkId);
+  AWAIT_READY(runningStatus);
+  EXPECT_EQ(TASK_RUNNING, runningStatus.get().state());
+
+  // 2. Spoof a lost task message for the slave.
+  StatusUpdateMessage lostUpdate;
+  lostUpdate.mutable_update()->CopyFrom(createStatusUpdate(
+      frameworkId.get(),
+      runningStatus.get().slave_id(),
+      runningStatus.get().task_id(),
+      TASK_LOST));
+
+  // Spoof a message from a random master; this should be dropped by
+  // the scheduler driver. Since this is delivered locally, it is
+  // synchronously placed on the scheduler driver's queue.
+  process::post(UPID("master2@127.0.0.1:50"), registered.get().to, lostUpdate);
+
+  // 3. Once the message is sent to the scheduler, kill the task.
+  EXPECT_CALL(exec, killTask(_, _))
+    .WillOnce(SendStatusUpdateFromTaskID(TASK_KILLED));
+
+  Future<TaskStatus> killedStatus;
+  EXPECT_CALL(sched, statusUpdate(&driver, _))
+    .WillOnce(FutureArg<1>(&killedStatus));
+
+  driver.killTask(runningStatus.get().task_id());
+
+  // 4. Ensure the task was KILLED rather than LOST.
+  AWAIT_READY(killedStatus);
+  EXPECT_EQ(TASK_KILLED, killedStatus.get().state());
+
+  EXPECT_CALL(exec, shutdown(_))
+    .WillRepeatedly(Return());
+
+  driver.stop();
+  driver.join();
+
+  Shutdown();
+}


[2/2] git commit: Added build instructions for Mac OS X Mavericks.

Posted by bm...@apache.org.
Added build instructions for Mac OS X Mavericks.

From: Niklas Nielsen <ni...@qni.dk>
Review: https://reviews.apache.org/r/15727


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/718eb711
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/718eb711
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/718eb711

Branch: refs/heads/master
Commit: 718eb711cb8a2984536e3fee9e6ade53c9977bfc
Parents: 57c2b72
Author: Benjamin Mahler <bm...@twitter.com>
Authored: Mon Nov 25 15:57:16 2013 -0800
Committer: Benjamin Mahler <bm...@twitter.com>
Committed: Mon Nov 25 15:59:56 2013 -0800

----------------------------------------------------------------------
 docs/getting-started.md | 44 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/718eb711/docs/getting-started.md
----------------------------------------------------------------------
diff --git a/docs/getting-started.md b/docs/getting-started.md
index 3986dab..3ab2d0f 100644
--- a/docs/getting-started.md
+++ b/docs/getting-started.md
@@ -56,6 +56,12 @@ If you are building from git repository, you will need to additionally install t
 
 > 2. Mesos is currently being developed/tested/supported on 64 Bit machines only.
 
+> 3. Currently, Mesos does not build with the Clang compiler. Up to and including Mac OS X Mountain Lion, `gcc` and `g++` were GNU GCC compiler frontends for LLVM.
+This changed with Mac OS X Mavericks: `gcc` and `g++` are now Clang wrappers which compile against a new standard C library.
+This unfortunately breaks our current build process, so please follow the separate instructions below to get Mesos building on Mavericks.
+
+> 4. Mesos does not build with automake 1.14. Please make sure an older automake, for example version 1.12, is installed.
+
 
 ## Building Mesos
 
@@ -77,6 +83,42 @@ If you are building from git repository, you will need to additionally install t
         # Install (***Optional***).
         $ make install
 
+### Building Mesos on Mac OS X Mavericks
+
+        # Make sure you have the right versions of automake and GCC.
+        # For example:
+        $ automake --version
+        automake (GNU automake) 1.12.6
+        ...
+
+        $ aclocal --version
+        aclocal (GNU automake) 1.12.6
+        ...
+
+        $ gcc-4.8 --version
+        gcc-4.8 (GCC) 4.8.1
+        ...
+
+        # Follow bootstrap steps above.
+
+        # Configure and build.
+        $ mkdir build
+        $ cd build
+
+        # Prepending CC and CXX selects the custom compilers to build Mesos with.
+        $ CC=gcc-4.8 CXX=g++-4.8 ../configure
+
+        # Follow make steps above.
+
+***NOTES***
+
+> 1. Link errors might occur if clang versions of `gcc` and `g++` were used to compile some of the object files.
+This happens if configure was run without `CC` and `CXX` set and `make` subsequently failed.
+Make sure the build directory is completely empty before running configure.
+> 2. Python framework test failure is expected on Mavericks.
+This is due to the system Python binary usually being compiled against the new (non-GNU) standard C library.
+Either use a GCC built Python and select it with `PYTHON=<location> CC=gcc-4.8 CXX=g++-4.8 ../configure` or disable Python support with `CC=gcc-4.8 CXX=g++-4.8 ../configure --disable-python`.
+
 ## Examples
 Mesos comes bundled with example frameworks written in `C++`, `Java` and `Python`.
 
@@ -101,4 +143,4 @@ Mesos comes bundled with example frameworks written in `C++`, `Java` and `Python
         # Run Python framework (***Exits after successfully running some tasks.***).
         $ ./src/examples/python/test-framework 127.0.0.1:5050
 
-*NOTE: To build the example frameworks, make sure you build the test suite by doing `make check`.*
\ No newline at end of file
+*NOTE: To build the example frameworks, make sure you build the test suite by doing `make check`.*