You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bm...@apache.org on 2014/05/22 20:24:56 UTC

[1/2] git commit: Added retry for zookeeper_init per MESOS-1326.

Repository: mesos
Updated Branches:
  refs/heads/master cd707f1d3 -> 220da6b41


Added retry for zookeeper_init per MESOS-1326.

Often during temporary DNS failovers or outages, we see slaves
aborting in zookeeper_init.

In many cases, the slave can restart within 10 seconds. Since
retrying zookeeper_init should be safe to do, this is an attempt
to minimize the number of unnecessary aborts in the slave.

Review: https://reviews.apache.org/r/21783


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/9a398f72
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/9a398f72
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/9a398f72

Branch: refs/heads/master
Commit: 9a398f7222e077967d2484bfd5b51bd3482b419f
Parents: cd707f1
Author: Ben Mahler <be...@gmail.com>
Authored: Thu May 22 11:23:33 2014 -0700
Committer: Benjamin Mahler <bm...@twitter.com>
Committed: Thu May 22 11:23:34 2014 -0700

----------------------------------------------------------------------
 src/zookeeper/zookeeper.cpp | 38 +++++++++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/9a398f72/src/zookeeper/zookeeper.cpp
----------------------------------------------------------------------
diff --git a/src/zookeeper/zookeeper.cpp b/src/zookeeper/zookeeper.cpp
index 11029be..9c0c038 100644
--- a/src/zookeeper/zookeeper.cpp
+++ b/src/zookeeper/zookeeper.cpp
@@ -24,6 +24,7 @@
 #include <process/defer.hpp>
 #include <process/dispatch.hpp>
 #include <process/process.hpp>
+#include <process/timeout.hpp>
 
 #include <stout/duration.hpp>
 #include <stout/fatal.hpp>
@@ -39,6 +40,7 @@ using process::Future;
 using process::PID;
 using process::Process;
 using process::Promise;
+using process::Timeout;
 
 using std::map;
 using std::string;
@@ -67,13 +69,35 @@ public:
 
   virtual void initialize()
   {
-    zh = zookeeper_init(
-        servers.c_str(),
-        event,
-        static_cast<int>(timeout.ms()),
-        NULL,
-        &callback,
-        0);
+    // We retry zookeeper_init until the timeout elapses because we've
+    // seen cases where temporary DNS outages cause the slave to abort
+    // here. See MESOS-1326 for more information.
+    const Timeout timeout_ = Timeout::in(timeout);
+
+    while (!timeout_.expired()) {
+      zh = zookeeper_init(
+          servers.c_str(),
+          event,
+          static_cast<int>(timeout.ms()),
+          NULL,
+          &callback,
+          0);
+
+      // Unfortunately, EINVAL is highly overloaded in zookeeper_init
+      // and can correspond to:
+      //   (1) Empty / invalid 'host' string format.
+      //   (2) Any getaddrinfo error other than EAI_NONAME,
+      //       EAI_NODATA, and EAI_MEMORY are mapped to EINVAL.
+      // Either way, retrying is not problematic.
+      if (zh == NULL && errno == EINVAL) {
+        ErrnoError error("zookeeper_init failed");
+        LOG(WARNING) << error.message << " ; retrying in 1 second";
+        os::sleep(Seconds(1));
+        continue;
+      }
+
+      break;
+    }
 
     if (zh == NULL) {
       PLOG(FATAL) << "Failed to create ZooKeeper, zookeeper_init";


[2/2] git commit: Reduced acknowledgment delay in the scheduler driver.

Posted by bm...@apache.org.
Reduced acknowledgment delay in the scheduler driver.

We currently 'defer' in order to acknowledge processed status
updates in the scheduler driver.

This is unfortunate because it adds unnecessary delay and
unnecessary retries, especially when the scheduler's queue
is backed up.

It is sufficient to use 'aborted' to see if we can send a status
update. Ideally there would be proper locking on this variable
rather than the use of 'volatile', but since we're mostly concerned
about the driver aborting on the same thread as the status update
was processed, I've left it as is.

Review: https://reviews.apache.org/r/21788


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/220da6b4
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/220da6b4
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/220da6b4

Branch: refs/heads/master
Commit: 220da6b41fbfe7f301c1e238d68f2587315559f2
Parents: 9a398f7
Author: Ben Mahler <be...@gmail.com>
Authored: Thu May 22 11:24:08 2014 -0700
Committer: Benjamin Mahler <bm...@twitter.com>
Committed: Thu May 22 11:24:09 2014 -0700

----------------------------------------------------------------------
 src/sched/sched.cpp | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/220da6b4/src/sched/sched.cpp
----------------------------------------------------------------------
diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp
index 3684cfe..9e2de7e 100644
--- a/src/sched/sched.cpp
+++ b/src/sched/sched.cpp
@@ -619,32 +619,26 @@ protected:
 
     VLOG(1) << "Scheduler::statusUpdate took " << stopwatch.elapsed();
 
-    // Acknowledge the status update.
-    // NOTE: We do a dispatch here instead of directly sending the ACK because,
-    // we want to avoid sending the ACK if the driver was aborted when we
-    // made the statusUpdate call. This works because, the 'abort' message will
-    // be enqueued before the ACK message is processed.
-    if (pid != UPID()) {
-      dispatch(self(), &Self::statusUpdateAcknowledgement, update, pid);
-    }
-  }
-
-  void statusUpdateAcknowledgement(const StatusUpdate& update, const UPID& pid)
-  {
+    // Note that we need to look at the volatile 'aborted' here to
+    // so that we don't acknowledge the update if the driver was
+    // aborted during the processing of the update.
     if (aborted) {
       VLOG(1) << "Not sending status update acknowledgment message because "
               << "the driver is aborted!";
       return;
     }
 
-    VLOG(2) << "Sending ACK for status update " << update << " to " << pid;
+    // Acknowledge the status update.
+    if (pid != UPID()) {
+      VLOG(2) << "Sending ACK for status update " << update << " to " << pid;
 
-    StatusUpdateAcknowledgementMessage message;
-    message.mutable_framework_id()->MergeFrom(framework.id());
-    message.mutable_slave_id()->MergeFrom(update.slave_id());
-    message.mutable_task_id()->MergeFrom(update.status().task_id());
-    message.set_uuid(update.uuid());
-    send(pid, message);
+      StatusUpdateAcknowledgementMessage message;
+      message.mutable_framework_id()->MergeFrom(framework.id());
+      message.mutable_slave_id()->MergeFrom(update.slave_id());
+      message.mutable_task_id()->MergeFrom(update.status().task_id());
+      message.set_uuid(update.uuid());
+      send(pid, message);
+    }
   }
 
   void lostSlave(const UPID& from, const SlaveID& slaveId)