You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bm...@apache.org on 2013/12/03 20:42:56 UTC

[1/2] git commit: Fixed MESOS-851: The drivers now ignore queued messages when aborted.

Updated Branches:
  refs/heads/master bfb16a2e1 -> 4173ec935


Fixed MESOS-851: The drivers now ignore queued messages when aborted.

Review: https://reviews.apache.org/r/15872


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/853c606c
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/853c606c
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/853c606c

Branch: refs/heads/master
Commit: 853c606c0a8de6346aeefac9b48874d4d4ac38d5
Parents: bfb16a2
Author: Benjamin Mahler <bm...@twitter.com>
Authored: Tue Nov 26 19:39:28 2013 -0800
Committer: Benjamin Mahler <bm...@twitter.com>
Committed: Tue Dec 3 11:26:18 2013 -0800

----------------------------------------------------------------------
 src/exec/exec.cpp   | 14 ++++++++++++--
 src/sched/sched.cpp | 25 +++++++++++++++++--------
 2 files changed, 29 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/853c606c/src/exec/exec.cpp
----------------------------------------------------------------------
diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp
index a58203b..26a927a 100644
--- a/src/exec/exec.cpp
+++ b/src/exec/exec.cpp
@@ -407,7 +407,7 @@ protected:
   void abort()
   {
     VLOG(1) << "De-activating the executor libprocess";
-    aborted = true;
+    CHECK(aborted);
 
     Lock lock(mutex);
     pthread_cond_signal(cond);
@@ -548,7 +548,7 @@ private:
   bool connected; // Registered with the slave.
   UUID connection; // UUID to identify the connection instance.
   bool local;
-  bool aborted;
+  volatile bool aborted;
   pthread_mutex_t* mutex;
   pthread_cond_t* cond;
   const string directory;
@@ -742,6 +742,16 @@ Status MesosExecutorDriver::abort()
 
   CHECK(process != NULL);
 
+  // We set the volatile aborted to true here to prevent any further
+  // messages from being processed in the ExecutorProcess. However,
+  // if abort() is called from another thread as the ExecutorProcess,
+  // there may be at most one additional message processed.
+  // TODO(bmahler): Use an atomic boolean.
+  process->aborted = true;
+
+  // Dispatching here ensures that we still process the outstanding
+  // requests *from* the executor, since those do proceed when
+  // aborted is true.
   dispatch(process, &ExecutorProcess::abort);
 
   return status = DRIVER_ABORTED;

http://git-wip-us.apache.org/repos/asf/mesos/blob/853c606c/src/sched/sched.cpp
----------------------------------------------------------------------
diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp
index b958435..c465356 100644
--- a/src/sched/sched.cpp
+++ b/src/sched/sched.cpp
@@ -743,18 +743,17 @@ protected:
   {
     VLOG(1) << "Aborting framework '" << framework.id() << "'";
 
-    aborted = true;
+    CHECK(aborted);
 
     if (!connected) {
       VLOG(1) << "Not sending a deactivate message as master is disconnected";
-      return;
+    } else {
+      DeactivateFrameworkMessage message;
+      message.mutable_framework_id()->MergeFrom(framework.id());
+      CHECK_SOME(master);
+      send(master.get(), message);
     }
 
-    DeactivateFrameworkMessage message;
-    message.mutable_framework_id()->MergeFrom(framework.id());
-    CHECK_SOME(master);
-    send(master.get(), message);
-
     Lock lock(mutex);
     pthread_cond_signal(cond);
   }
@@ -982,7 +981,7 @@ private:
   bool failover;
   Result<UPID> master;
 
-  volatile bool connected; // Flag to indicate if framework is registered.
+  bool connected; // Flag to indicate if framework is registered.
   volatile bool aborted; // Flag to indicate if the driver is aborted.
 
   MasterDetector* detector;
@@ -1269,6 +1268,16 @@ Status MesosSchedulerDriver::abort()
 
   CHECK(process != NULL);
 
+  // We set the volatile aborted to true here to prevent any further
+  // messages from being processed in the SchedulerProcess. However,
+  // if abort() is called from another thread as the SchedulerProcess,
+  // there may be at most one additional message processed.
+  // TODO(bmahler): Use an atomic boolean.
+  process->aborted = true;
+
+  // Dispatching here ensures that we still process the outstanding
+  // requests *from* the scheduler, since those do proceed when
+  // aborted is true.
   dispatch(process, &SchedulerProcess::abort);
 
   return status = DRIVER_ABORTED;


[2/2] git commit: Fixed a flaky test: FaultToleranceTest.ReregisterFrameworkExitedExecutor.

Posted by bm...@apache.org.
Fixed a flaky test: FaultToleranceTest.ReregisterFrameworkExitedExecutor.

Review: https://reviews.apache.org/r/15955


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/4173ec93
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/4173ec93
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/4173ec93

Branch: refs/heads/master
Commit: 4173ec9358c7882eafdbf0dd8ef99173b3f05257
Parents: 853c606
Author: Benjamin Mahler <bm...@twitter.com>
Authored: Mon Dec 2 17:33:55 2013 -0800
Committer: Benjamin Mahler <bm...@twitter.com>
Committed: Tue Dec 3 11:36:59 2013 -0800

----------------------------------------------------------------------
 src/tests/fault_tolerance_tests.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/4173ec93/src/tests/fault_tolerance_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/fault_tolerance_tests.cpp b/src/tests/fault_tolerance_tests.cpp
index 27095cc..7157292 100644
--- a/src/tests/fault_tolerance_tests.cpp
+++ b/src/tests/fault_tolerance_tests.cpp
@@ -1175,17 +1175,18 @@ TEST_F(FaultToleranceTest, ReregisterFrameworkExitedExecutor)
   // Wait for the slave to re-register.
   AWAIT_READY(slaveReregisteredMessage);
 
-  // Allow the executor exited message and drop the status update.
+  // Allow the executor exited message and drop the status update,
+  // it's possible for a duplicate update to occur if the status
+  // update manager is notified of the new master after the task was
+  // killed.
   Future<ExitedExecutorMessage> executorExitedMessage =
     FUTURE_PROTOBUF(ExitedExecutorMessage(), _, _);
-  Future<StatusUpdateMessage> statusUpdateMessage =
-    DROP_PROTOBUF(StatusUpdateMessage(), _, _);
+  DROP_PROTOBUFS(StatusUpdateMessage(), _, _);
 
   // Now kill the executor.
   dispatch(isolator, &Isolator::killExecutor, frameworkId, DEFAULT_EXECUTOR_ID);
 
   AWAIT_READY(executorExitedMessage);
-  AWAIT_READY(statusUpdateMessage);
 
   // Now notify the framework of the new master.
   Future<FrameworkRegisteredMessage> frameworkRegisteredMessage2 =