You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by al...@apache.org on 2016/11/25 15:19:40 UTC

[01/11] mesos git commit: Refactored HealthChecker to never stop health checking.

Repository: mesos
Updated Branches:
  refs/heads/master 79cee8ad9 -> 08da1ee8a


Refactored HealthChecker to never stop health checking.

Prior to this patch, HealthChecker would stop performing health
checks after it marks the task for kill. Since tasks' lifecycle
is managed by scheduler-executor, HealthChecker should never stop
health checking on its own.

Allowing health checks to run forever enables the scheduler
make the decision about how to deal with unhealthy tasks.

Review: https://reviews.apache.org/r/52865/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/c428b8bc
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/c428b8bc
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/c428b8bc

Branch: refs/heads/master
Commit: c428b8bcf3d48ab9d3e2ef0cef5e3057619d7670
Parents: 79cee8a
Author: Alexander Rukletsov <ru...@gmail.com>
Authored: Fri Nov 25 16:13:00 2016 +0100
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Nov 25 16:13:00 2016 +0100

----------------------------------------------------------------------
 src/docker/executor.cpp             | 23 ++++++++---------------
 src/health-check/health_checker.cpp | 27 +++++++++++++--------------
 src/health-check/health_checker.hpp | 11 +++++------
 src/launcher/default_executor.cpp   | 15 +--------------
 src/launcher/executor.cpp           |  8 --------
 5 files changed, 27 insertions(+), 57 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/c428b8bc/src/docker/executor.cpp
----------------------------------------------------------------------
diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index eefbc0c..e50b340 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -526,13 +526,14 @@ private:
       namespaces.push_back("net");
     }
 
-    Try<Owned<health::HealthChecker>> _checker = health::HealthChecker::create(
-        healthCheck,
-        launcherDir,
-        self(),
-        task.task_id(),
-        containerPid,
-        namespaces);
+    Try<Owned<health::HealthChecker>> _checker =
+      health::HealthChecker::create(
+          healthCheck,
+          launcherDir,
+          self(),
+          task.task_id(),
+          containerPid,
+          namespaces);
 
     if (_checker.isError()) {
       // TODO(gilbert): Consider ABORT and return a TASK_FAILED here.
@@ -540,14 +541,6 @@ private:
            << _checker.error() << endl;
     } else {
       checker = _checker.get();
-
-      checker->healthCheck()
-        .onAny([](const Future<Nothing>& future) {
-          // Only possible to be a failure.
-          if (future.isFailed()) {
-            cerr << "Health check failed:" << future.failure() << endl;
-          }
-        });
     }
   }
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/c428b8bc/src/health-check/health_checker.cpp
----------------------------------------------------------------------
diff --git a/src/health-check/health_checker.cpp b/src/health-check/health_checker.cpp
index af5500b..c4f6670 100644
--- a/src/health-check/health_checker.cpp
+++ b/src/health-check/health_checker.cpp
@@ -31,6 +31,7 @@
 
 #include <process/collect.hpp>
 #include <process/delay.hpp>
+#include <process/dispatch.hpp>
 #include <process/http.hpp>
 #include <process/io.hpp>
 #include <process/subprocess.hpp>
@@ -53,11 +54,11 @@
 #endif
 
 using process::delay;
+using process::dispatch;
 using process::Clock;
 using process::Failure;
 using process::Future;
 using process::Owned;
-using process::Promise;
 using process::Subprocess;
 using process::Time;
 using process::UPID;
@@ -157,12 +158,6 @@ HealthChecker::~HealthChecker()
 }
 
 
-Future<Nothing> HealthChecker::healthCheck()
-{
-  return dispatch(process.get(), &HealthCheckerProcess::healthCheck);
-}
-
-
 HealthCheckerProcess::HealthCheckerProcess(
     const HealthCheck& _check,
     const string& _launcherDir,
@@ -188,7 +183,13 @@ HealthCheckerProcess::HealthCheckerProcess(
 }
 
 
-Future<Nothing> HealthCheckerProcess::healthCheck()
+void HealthCheckerProcess::initialize()
+{
+  healthCheck();
+}
+
+
+void HealthCheckerProcess::healthCheck()
 {
   VLOG(1) << "Health check starting in "
           << Seconds(static_cast<int64_t>(check.delay_seconds()))
@@ -200,7 +201,6 @@ Future<Nothing> HealthCheckerProcess::healthCheck()
   delay(Seconds(static_cast<int64_t>(check.delay_seconds())),
         self(),
         &Self::_healthCheck);
-  return promise.future();
 }
 
 
@@ -231,11 +231,10 @@ void HealthCheckerProcess::failure(const string& message)
   // not exit before the data is sent to the executor.
   send(executor, taskHealthStatus);
 
-  if (killTask) {
-    promise.fail(message);
-  } else {
-    reschedule();
-  }
+  // Even if we set the `kill_task` flag, it is an executor who kills the task
+  // and honors the flag (or not). We have no control over the task's lifetime,
+  // hence we should continue until we are explicitly asked to stop.
+  reschedule();
 }
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/c428b8bc/src/health-check/health_checker.hpp
----------------------------------------------------------------------
diff --git a/src/health-check/health_checker.hpp b/src/health-check/health_checker.hpp
index 837d135..edd9bc7 100644
--- a/src/health-check/health_checker.hpp
+++ b/src/health-check/health_checker.hpp
@@ -45,7 +45,8 @@ class HealthChecker
 {
 public:
   /**
-   * Attempts to create a `HealthChecker` object.
+   * Attempts to create a `HealthChecker` object. In case of success, health
+   * checking starts immediately after initialization.
    *
    * @param check The protobuf message definition of health check.
    * @param executor The executor UPID to which health check results will be
@@ -68,8 +69,6 @@ public:
 
   ~HealthChecker();
 
-  process::Future<Nothing> healthCheck();
-
 private:
   explicit HealthChecker(process::Owned<HealthCheckerProcess> process);
 
@@ -90,14 +89,15 @@ public:
 
   virtual ~HealthCheckerProcess() {}
 
-  process::Future<Nothing> healthCheck();
+protected:
+  virtual void initialize() override;
 
 private:
   void failure(const std::string& message);
   void success();
 
+  void healthCheck();
   void _healthCheck();
-
   void __healthCheck(const process::Future<Nothing>& future);
 
   process::Future<Nothing> _commandHealthCheck();
@@ -120,7 +120,6 @@ private:
 
   void reschedule();
 
-  process::Promise<Nothing> promise;
   HealthCheck check;
   std::string launcherDir;
   bool initializing;

http://git-wip-us.apache.org/repos/asf/mesos/blob/c428b8bc/src/launcher/default_executor.cpp
----------------------------------------------------------------------
diff --git a/src/launcher/default_executor.cpp b/src/launcher/default_executor.cpp
index f4e1ea4..0c590c8 100644
--- a/src/launcher/default_executor.cpp
+++ b/src/launcher/default_executor.cpp
@@ -420,20 +420,7 @@ protected:
           return;
         }
 
-        Owned<health::HealthChecker> checker = _checker.get();
-
-        checker->healthCheck()
-          .onAny(defer(self(), [this, taskId](const Future<Nothing>& future) {
-            if (!future.isReady()) {
-              LOG(ERROR)
-                << "Health check for task '" << taskId << "' failed due to: "
-                << (future.isFailed() ? future.failure() : "discarded");
-
-              __shutdown();
-            }
-          }));
-
-        checkers.push_back(checker);
+        checkers.push_back(_checker.get());
       }
 
       // Currently, the Mesos agent does not expose the mapping from

http://git-wip-us.apache.org/repos/asf/mesos/blob/c428b8bc/src/launcher/executor.cpp
----------------------------------------------------------------------
diff --git a/src/launcher/executor.cpp b/src/launcher/executor.cpp
index ce0b199..ec1f7a0 100644
--- a/src/launcher/executor.cpp
+++ b/src/launcher/executor.cpp
@@ -465,14 +465,6 @@ protected:
              << _checker.error() << endl;
       } else {
         checker = _checker.get();
-
-        checker->healthCheck()
-          .onAny([](const Future<Nothing>& future) {
-            // Only possible to be a failure.
-            if (future.isFailed()) {
-              cerr << "Health check failed" << endl;
-            }
-          });
       }
     }
 


[07/11] mesos git commit: Ensured docker executor stops health checking terminated tasks.

Posted by al...@apache.org.
Ensured docker executor stops health checking terminated tasks.

We stop health checking both when the task is reaped and
killed since these may be two different execution paths.

Review: https://reviews.apache.org/r/52870/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/2814d1e6
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/2814d1e6
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/2814d1e6

Branch: refs/heads/master
Commit: 2814d1e6135a85aeb30befd812264d80192f4b94
Parents: 2edfcd3
Author: Alexander Rukletsov <ru...@gmail.com>
Authored: Fri Nov 25 16:14:15 2016 +0100
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Nov 25 16:14:15 2016 +0100

----------------------------------------------------------------------
 src/docker/executor.cpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/2814d1e6/src/docker/executor.cpp
----------------------------------------------------------------------
diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index e50b340..e9d65ef 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -297,7 +297,7 @@ protected:
 
     // This check prevents us from sending `TASK_RUNNING` updates
     // after the task has been transitioned to `TASK_KILLING`.
-    if (killed) {
+    if (killed || terminated) {
       return;
     }
 
@@ -376,6 +376,11 @@ private:
         driver.get()->sendStatusUpdate(status);
       }
 
+      // Stop health checking the task.
+      if (checker.get() != nullptr) {
+        checker->stop();
+      }
+
       // TODO(bmahler): Replace this with 'docker kill' so
       // that we can adjust the grace period in the case of
       // a `KillPolicy` override.
@@ -387,6 +392,11 @@ private:
   {
     terminated = true;
 
+    // Stop health checking the task.
+    if (checker.get() != nullptr) {
+      checker->stop();
+    }
+
     // In case the stop is stuck, discard it.
     stop.discard();
 


[05/11] mesos git commit: Health checks may be stopped on demand.

Posted by al...@apache.org.
Health checks may be stopped on demand.

Review: https://reviews.apache.org/r/52868/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/bd6186d2
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/bd6186d2
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/bd6186d2

Branch: refs/heads/master
Commit: bd6186d20d6ffe5a565ea29a08e1e0eb86873e63
Parents: 3da7397
Author: Alexander Rukletsov <ru...@gmail.com>
Authored: Fri Nov 25 16:13:54 2016 +0100
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Nov 25 16:13:54 2016 +0100

----------------------------------------------------------------------
 src/health-check/health_checker.cpp | 8 ++++++++
 src/health-check/health_checker.hpp | 5 +++++
 2 files changed, 13 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/bd6186d2/src/health-check/health_checker.cpp
----------------------------------------------------------------------
diff --git a/src/health-check/health_checker.cpp b/src/health-check/health_checker.cpp
index e66c9df..b769ecd 100644
--- a/src/health-check/health_checker.cpp
+++ b/src/health-check/health_checker.cpp
@@ -158,6 +158,14 @@ HealthChecker::~HealthChecker()
 }
 
 
+void HealthChecker::stop()
+{
+  LOG(INFO) << "Health checking stopped";
+
+  terminate(process.get(), true);
+}
+
+
 HealthCheckerProcess::HealthCheckerProcess(
     const HealthCheck& _check,
     const string& _launcherDir,

http://git-wip-us.apache.org/repos/asf/mesos/blob/bd6186d2/src/health-check/health_checker.hpp
----------------------------------------------------------------------
diff --git a/src/health-check/health_checker.hpp b/src/health-check/health_checker.hpp
index 733d2bd..bd7b753 100644
--- a/src/health-check/health_checker.hpp
+++ b/src/health-check/health_checker.hpp
@@ -70,6 +70,11 @@ public:
 
   ~HealthChecker();
 
+  /**
+   * Immediately stops health checking. Any in-flight health checks are dropped.
+   */
+  void stop();
+
 private:
   explicit HealthChecker(process::Owned<HealthCheckerProcess> process);
 


[02/11] mesos git commit: Renamed functions in HealthChecker for clarity.

Posted by al...@apache.org.
Renamed functions in HealthChecker for clarity.

Use descriptive function names instead of underscore
prefixes for functions in HealthChecker.

Review: https://reviews.apache.org/r/54004/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/89e41289
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/89e41289
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/89e41289

Branch: refs/heads/master
Commit: 89e41289ab1df3ab9328ffa0e2a5c7396e3e59eb
Parents: c428b8b
Author: Alexander Rukletsov <ru...@gmail.com>
Authored: Fri Nov 25 16:13:24 2016 +0100
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Nov 25 16:13:24 2016 +0100

----------------------------------------------------------------------
 src/health-check/health_checker.cpp | 36 +++++++++++++-------------------
 src/health-check/health_checker.hpp | 15 +++++++------
 2 files changed, 22 insertions(+), 29 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/89e41289/src/health-check/health_checker.cpp
----------------------------------------------------------------------
diff --git a/src/health-check/health_checker.cpp b/src/health-check/health_checker.cpp
index c4f6670..4a02e86 100644
--- a/src/health-check/health_checker.cpp
+++ b/src/health-check/health_checker.cpp
@@ -185,12 +185,6 @@ HealthCheckerProcess::HealthCheckerProcess(
 
 void HealthCheckerProcess::initialize()
 {
-  healthCheck();
-}
-
-
-void HealthCheckerProcess::healthCheck()
-{
   VLOG(1) << "Health check starting in "
           << Seconds(static_cast<int64_t>(check.delay_seconds()))
           << ", grace period "
@@ -200,7 +194,7 @@ void HealthCheckerProcess::healthCheck()
 
   delay(Seconds(static_cast<int64_t>(check.delay_seconds())),
         self(),
-        &Self::_healthCheck);
+        &Self::performSingleCheck);
 }
 
 
@@ -257,23 +251,23 @@ void HealthCheckerProcess::success()
 }
 
 
-void HealthCheckerProcess::_healthCheck()
+void HealthCheckerProcess::performSingleCheck()
 {
   Future<Nothing> checkResult;
 
   switch (check.type()) {
     case HealthCheck::COMMAND: {
-      checkResult = _commandHealthCheck();
+      checkResult = commandHealthCheck();
       break;
     }
 
     case HealthCheck::HTTP: {
-      checkResult = _httpHealthCheck();
+      checkResult = httpHealthCheck();
       break;
     }
 
     case HealthCheck::TCP: {
-      checkResult = _tcpHealthCheck();
+      checkResult = tcpHealthCheck();
       break;
     }
 
@@ -282,11 +276,11 @@ void HealthCheckerProcess::_healthCheck()
     }
   }
 
-  checkResult.onAny(defer(self(), &Self::__healthCheck, lambda::_1));
+  checkResult.onAny(defer(self(), &Self::processCheckResult, lambda::_1));
 }
 
 
-void HealthCheckerProcess::__healthCheck(const Future<Nothing>& future)
+void HealthCheckerProcess::processCheckResult(const Future<Nothing>& future)
 {
   if (future.isReady()) {
     success();
@@ -301,7 +295,7 @@ void HealthCheckerProcess::__healthCheck(const Future<Nothing>& future)
 }
 
 
-Future<Nothing> HealthCheckerProcess::_commandHealthCheck()
+Future<Nothing> HealthCheckerProcess::commandHealthCheck()
 {
   CHECK_EQ(HealthCheck::COMMAND, check.type());
   CHECK(check.has_command());
@@ -387,7 +381,7 @@ Future<Nothing> HealthCheckerProcess::_commandHealthCheck()
 }
 
 
-Future<Nothing> HealthCheckerProcess::_httpHealthCheck()
+Future<Nothing> HealthCheckerProcess::httpHealthCheck()
 {
   CHECK_EQ(HealthCheck::HTTP, check.type());
   CHECK(check.has_http());
@@ -452,11 +446,11 @@ Future<Nothing> HealthCheckerProcess::_httpHealthCheck()
           string(HTTP_CHECK_COMMAND) + " has not returned after " +
           stringify(timeout) + "; aborting");
     })
-    .then(defer(self(), &Self::__httpHealthCheck, lambda::_1));
+    .then(defer(self(), &Self::_httpHealthCheck, lambda::_1));
 }
 
 
-Future<Nothing> HealthCheckerProcess::__httpHealthCheck(
+Future<Nothing> HealthCheckerProcess::_httpHealthCheck(
     const tuple<
         Future<Option<int>>,
         Future<string>,
@@ -515,7 +509,7 @@ Future<Nothing> HealthCheckerProcess::__httpHealthCheck(
 }
 
 
-Future<Nothing> HealthCheckerProcess::_tcpHealthCheck()
+Future<Nothing> HealthCheckerProcess::tcpHealthCheck()
 {
   CHECK_EQ(HealthCheck::TCP, check.type());
   CHECK(check.has_tcp());
@@ -575,11 +569,11 @@ Future<Nothing> HealthCheckerProcess::_tcpHealthCheck()
           string(TCP_CHECK_COMMAND) + " has not returned after " +
           stringify(timeout) + "; aborting");
     })
-    .then(defer(self(), &Self::__tcpHealthCheck, lambda::_1));
+    .then(defer(self(), &Self::_tcpHealthCheck, lambda::_1));
 }
 
 
-Future<Nothing> HealthCheckerProcess::__tcpHealthCheck(
+Future<Nothing> HealthCheckerProcess::_tcpHealthCheck(
     const tuple<
         Future<Option<int>>,
         Future<string>,
@@ -623,7 +617,7 @@ void HealthCheckerProcess::reschedule()
 
   delay(Seconds(static_cast<int64_t>(check.interval_seconds())),
         self(),
-        &Self::_healthCheck);
+        &Self::performSingleCheck);
 }
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/89e41289/src/health-check/health_checker.hpp
----------------------------------------------------------------------
diff --git a/src/health-check/health_checker.hpp b/src/health-check/health_checker.hpp
index edd9bc7..ce0a232 100644
--- a/src/health-check/health_checker.hpp
+++ b/src/health-check/health_checker.hpp
@@ -96,23 +96,22 @@ private:
   void failure(const std::string& message);
   void success();
 
-  void healthCheck();
-  void _healthCheck();
-  void __healthCheck(const process::Future<Nothing>& future);
+  void performSingleCheck();
+  void processCheckResult(const process::Future<Nothing>& future);
 
-  process::Future<Nothing> _commandHealthCheck();
+  process::Future<Nothing> commandHealthCheck();
 
-  process::Future<Nothing> _httpHealthCheck();
+  process::Future<Nothing> httpHealthCheck();
 
-  process::Future<Nothing> __httpHealthCheck(
+  process::Future<Nothing> _httpHealthCheck(
       const std::tuple<
           process::Future<Option<int>>,
           process::Future<std::string>,
           process::Future<std::string>>& t);
 
-  process::Future<Nothing> _tcpHealthCheck();
+  process::Future<Nothing> tcpHealthCheck();
 
-  process::Future<Nothing> __tcpHealthCheck(
+  process::Future<Nothing> _tcpHealthCheck(
       const std::tuple<
           process::Future<Option<int>>,
           process::Future<std::string>,


[03/11] mesos git commit: Refactored HealthChecker::reschedule to take duration as an argument.

Posted by al...@apache.org.
Refactored HealthChecker::reschedule to take duration as an argument.

To facilitate code reuse, HealthChecker::reschedule() is generalized.
This will become even more valuable when we add pause/resume functions.

Review: https://reviews.apache.org/r/52866/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/bbe3b640
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/bbe3b640
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/bbe3b640

Branch: refs/heads/master
Commit: bbe3b6405c7817e3e0f8e2b5b93953325188b42a
Parents: 89e4128
Author: Alexander Rukletsov <ru...@gmail.com>
Authored: Fri Nov 25 16:13:32 2016 +0100
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Nov 25 16:13:32 2016 +0100

----------------------------------------------------------------------
 src/health-check/health_checker.cpp | 21 ++++++++-------------
 src/health-check/health_checker.hpp |  3 ++-
 2 files changed, 10 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/bbe3b640/src/health-check/health_checker.cpp
----------------------------------------------------------------------
diff --git a/src/health-check/health_checker.cpp b/src/health-check/health_checker.cpp
index 4a02e86..c118ca5 100644
--- a/src/health-check/health_checker.cpp
+++ b/src/health-check/health_checker.cpp
@@ -185,16 +185,14 @@ HealthCheckerProcess::HealthCheckerProcess(
 
 void HealthCheckerProcess::initialize()
 {
-  VLOG(1) << "Health check starting in "
+  VLOG(1) << "Health check starts in "
           << Seconds(static_cast<int64_t>(check.delay_seconds()))
           << ", grace period "
           << Seconds(static_cast<int64_t>(check.grace_period_seconds()));
 
   startTime = Clock::now();
 
-  delay(Seconds(static_cast<int64_t>(check.delay_seconds())),
-        self(),
-        &Self::performSingleCheck);
+  scheduleNext(Seconds(static_cast<int64_t>(check.delay_seconds())));
 }
 
 
@@ -204,7 +202,7 @@ void HealthCheckerProcess::failure(const string& message)
       check.grace_period_seconds() > 0 &&
       (Clock::now() - startTime).secs() <= check.grace_period_seconds()) {
     LOG(INFO) << "Ignoring failure as health check still in grace period";
-    reschedule();
+    scheduleNext(Seconds(static_cast<int64_t>(check.interval_seconds())));
     return;
   }
 
@@ -228,7 +226,7 @@ void HealthCheckerProcess::failure(const string& message)
   // Even if we set the `kill_task` flag, it is an executor who kills the task
   // and honors the flag (or not). We have no control over the task's lifetime,
   // hence we should continue until we are explicitly asked to stop.
-  reschedule();
+  scheduleNext(Seconds(static_cast<int64_t>(check.interval_seconds())));
 }
 
 
@@ -247,7 +245,7 @@ void HealthCheckerProcess::success()
   }
 
   consecutiveFailures = 0;
-  reschedule();
+  scheduleNext(Seconds(static_cast<int64_t>(check.interval_seconds())));
 }
 
 
@@ -610,14 +608,11 @@ Future<Nothing> HealthCheckerProcess::_tcpHealthCheck(
 }
 
 
-void HealthCheckerProcess::reschedule()
+void HealthCheckerProcess::scheduleNext(const Duration& duration)
 {
-  VLOG(1) << "Rescheduling health check in "
-          << Seconds(static_cast<int64_t>(check.interval_seconds()));
+  VLOG(1) << "Scheduling health check in " << duration;
 
-  delay(Seconds(static_cast<int64_t>(check.interval_seconds())),
-        self(),
-        &Self::performSingleCheck);
+  delay(duration, self(), &Self::performSingleCheck);
 }
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/bbe3b640/src/health-check/health_checker.hpp
----------------------------------------------------------------------
diff --git a/src/health-check/health_checker.hpp b/src/health-check/health_checker.hpp
index ce0a232..31ed744 100644
--- a/src/health-check/health_checker.hpp
+++ b/src/health-check/health_checker.hpp
@@ -30,6 +30,7 @@
 #include <process/protobuf.hpp>
 #include <process/time.hpp>
 
+#include <stout/duration.hpp>
 #include <stout/nothing.hpp>
 
 #include "messages/messages.hpp"
@@ -117,7 +118,7 @@ private:
           process::Future<std::string>,
           process::Future<std::string>>& t);
 
-  void reschedule();
+  void scheduleNext(const Duration& duration);
 
   HealthCheck check;
   std::string launcherDir;


[09/11] mesos git commit: Used callback instead of `send()` for health status updates.

Posted by al...@apache.org.
Used callback instead of `send()` for health status updates.

Since HealthChecker is now used as a library only and does not live
in a separate OS process, there is no need to use libprocess message
sending for health status updates; a callback will do.

Review: https://reviews.apache.org/r/52872/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/14b5e94d
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/14b5e94d
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/14b5e94d

Branch: refs/heads/master
Commit: 14b5e94dbecde5f2e72ce64d7c8e0384e746ef3d
Parents: 46e348d
Author: Alexander Rukletsov <ru...@gmail.com>
Authored: Fri Nov 25 16:14:29 2016 +0100
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Nov 25 16:14:29 2016 +0100

----------------------------------------------------------------------
 src/docker/executor.cpp             | 27 ++++++++-------------------
 src/health-check/health_checker.cpp | 13 ++++++-------
 src/health-check/health_checker.hpp | 14 +++++++++-----
 src/launcher/default_executor.cpp   | 29 +++++++++++------------------
 src/launcher/executor.cpp           | 21 ++++++---------------
 5 files changed, 40 insertions(+), 64 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/14b5e94d/src/docker/executor.cpp
----------------------------------------------------------------------
diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index e9d65ef..94e116f 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -32,6 +32,7 @@
 #include <stout/error.hpp>
 #include <stout/flags.hpp>
 #include <stout/json.hpp>
+#include <stout/lambda.hpp>
 #include <stout/os.hpp>
 #include <stout/protobuf.hpp>
 #include <stout/try.hpp>
@@ -277,19 +278,7 @@ public:
   void error(ExecutorDriver* driver, const string& message) {}
 
 protected:
-  virtual void initialize()
-  {
-    install<TaskHealthStatus>(
-        &Self::taskHealthUpdated,
-        &TaskHealthStatus::task_id,
-        &TaskHealthStatus::healthy,
-        &TaskHealthStatus::kill_task);
-  }
-
-  void taskHealthUpdated(
-      const TaskID& taskID,
-      const bool& healthy,
-      const bool& initiateTaskKill)
+  void taskHealthUpdated(const TaskHealthStatus& healthStatus)
   {
     if (driver.isNone()) {
       return;
@@ -302,11 +291,11 @@ protected:
     }
 
     cout << "Received task health update, healthy: "
-         << stringify(healthy) << endl;
+         << stringify(healthStatus.healthy()) << endl;
 
     TaskStatus status;
-    status.mutable_task_id()->CopyFrom(taskID);
-    status.set_healthy(healthy);
+    status.mutable_task_id()->CopyFrom(healthStatus.task_id());
+    status.set_healthy(healthStatus.healthy());
     status.set_state(TASK_RUNNING);
 
     if (containerNetworkInfo.isSome()) {
@@ -316,9 +305,9 @@ protected:
 
     driver.get()->sendStatusUpdate(status);
 
-    if (initiateTaskKill) {
+    if (healthStatus.kill_task()) {
       killedByHealthCheck = true;
-      killTask(driver.get(), taskID);
+      killTask(driver.get(), healthStatus.task_id());
     }
   }
 
@@ -540,7 +529,7 @@ private:
       health::HealthChecker::create(
           healthCheck,
           launcherDir,
-          self(),
+          defer(self(), &Self::taskHealthUpdated, lambda::_1),
           task.task_id(),
           containerPid,
           namespaces);

http://git-wip-us.apache.org/repos/asf/mesos/blob/14b5e94d/src/health-check/health_checker.cpp
----------------------------------------------------------------------
diff --git a/src/health-check/health_checker.cpp b/src/health-check/health_checker.cpp
index b769ecd..7cdf5df 100644
--- a/src/health-check/health_checker.cpp
+++ b/src/health-check/health_checker.cpp
@@ -61,7 +61,6 @@ using process::Future;
 using process::Owned;
 using process::Subprocess;
 using process::Time;
-using process::UPID;
 
 using std::map;
 using std::string;
@@ -120,7 +119,7 @@ pid_t cloneWithSetns(
 Try<Owned<HealthChecker>> HealthChecker::create(
     const HealthCheck& check,
     const string& launcherDir,
-    const UPID& executor,
+    const lambda::function<void(const TaskHealthStatus&)>& callback,
     const TaskID& taskID,
     Option<pid_t> taskPid,
     const vector<string>& namespaces)
@@ -134,7 +133,7 @@ Try<Owned<HealthChecker>> HealthChecker::create(
   Owned<HealthCheckerProcess> process(new HealthCheckerProcess(
       check,
       launcherDir,
-      executor,
+      callback,
       taskID,
       taskPid,
       namespaces));
@@ -169,15 +168,15 @@ void HealthChecker::stop()
 HealthCheckerProcess::HealthCheckerProcess(
     const HealthCheck& _check,
     const string& _launcherDir,
-    const UPID& _executor,
+    const lambda::function<void(const TaskHealthStatus&)>& _callback,
     const TaskID& _taskID,
     Option<pid_t> _taskPid,
     const vector<string>& _namespaces)
   : ProcessBase(process::ID::generate("health-checker")),
     check(_check),
+    healthUpdateCallback(_callback),
     launcherDir(_launcherDir),
     initializing(true),
-    executor(_executor),
     taskID(_taskID),
     taskPid(_taskPid),
     namespaces(_namespaces),
@@ -243,7 +242,7 @@ void HealthCheckerProcess::failure(const string& message)
   // We assume this is a local send, i.e. the health checker library
   // is not used in a binary external to the executor and hence can
   // not exit before the data is sent to the executor.
-  send(executor, taskHealthStatus);
+  healthUpdateCallback(taskHealthStatus);
 
   // Even if we set the `kill_task` flag, it is an executor who kills the task
   // and honors the flag (or not). We have no control over the task's lifetime,
@@ -262,7 +261,7 @@ void HealthCheckerProcess::success()
     TaskHealthStatus taskHealthStatus;
     taskHealthStatus.set_healthy(true);
     taskHealthStatus.mutable_task_id()->CopyFrom(taskID);
-    send(executor, taskHealthStatus);
+    healthUpdateCallback(taskHealthStatus);
     initializing = false;
   }
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/14b5e94d/src/health-check/health_checker.hpp
----------------------------------------------------------------------
diff --git a/src/health-check/health_checker.hpp b/src/health-check/health_checker.hpp
index bd7b753..8d410a8 100644
--- a/src/health-check/health_checker.hpp
+++ b/src/health-check/health_checker.hpp
@@ -31,6 +31,7 @@
 #include <process/time.hpp>
 
 #include <stout/duration.hpp>
+#include <stout/lambda.hpp>
 #include <stout/nothing.hpp>
 
 #include "messages/messages.hpp"
@@ -50,20 +51,23 @@ public:
    * checking starts immediately after initialization.
    *
    * @param check The protobuf message definition of health check.
-   * @param executor The executor UPID to which health check results will be
-   *     reported.
    * @param launcherDir A directory where Mesos helper binaries are located.
+   * @param callback A callback HealthChecker uses to send health status
+   *     updates to its owner (usually an executor).
    * @param taskID The TaskID of the target task.
    * @param taskPid The target task's pid used to enter the specified
    *     namespaces.
    * @param namespaces The namespaces to enter prior performing a single health
    *     check.
    * @return A `HealthChecker` object or an error if `create` fails.
+   *
+   * @todo A better approach would be to return a stream of updates, e.g.,
+   * `process::Stream<TaskHealthStatus>` rather than invoking a callback.
    */
   static Try<process::Owned<HealthChecker>> create(
       const HealthCheck& check,
       const std::string& launcherDir,
-      const process::UPID& executor,
+      const lambda::function<void(const TaskHealthStatus&)>& callback,
       const TaskID& taskID,
       Option<pid_t> taskPid,
       const std::vector<std::string>& namespaces);
@@ -88,7 +92,7 @@ public:
   HealthCheckerProcess(
       const HealthCheck& _check,
       const std::string& _launcherDir,
-      const process::UPID& _executor,
+      const lambda::function<void(const TaskHealthStatus&)>& _callback,
       const TaskID& _taskID,
       Option<pid_t> _taskPid,
       const std::vector<std::string>& _namespaces);
@@ -131,9 +135,9 @@ private:
   Duration checkGracePeriod;
   Duration checkTimeout;
 
+  lambda::function<void(const TaskHealthStatus&)> healthUpdateCallback;
   std::string launcherDir;
   bool initializing;
-  process::UPID executor;
   TaskID taskID;
   Option<pid_t> taskPid;
   std::vector<std::string> namespaces;

http://git-wip-us.apache.org/repos/asf/mesos/blob/14b5e94d/src/launcher/default_executor.cpp
----------------------------------------------------------------------
diff --git a/src/launcher/default_executor.cpp b/src/launcher/default_executor.cpp
index 99a1f5e..6e377d4 100644
--- a/src/launcher/default_executor.cpp
+++ b/src/launcher/default_executor.cpp
@@ -37,6 +37,7 @@
 
 #include <stout/flags.hpp>
 #include <stout/fs.hpp>
+#include <stout/lambda.hpp>
 #include <stout/linkedhashmap.hpp>
 #include <stout/option.hpp>
 #include <stout/os.hpp>
@@ -201,12 +202,6 @@ public:
 protected:
   virtual void initialize()
   {
-    install<TaskHealthStatus>(
-        &Self::taskHealthUpdated,
-        &TaskHealthStatus::task_id,
-        &TaskHealthStatus::healthy,
-        &TaskHealthStatus::kill_task);
-
     mesos.reset(new Mesos(
         contentType,
         defer(self(), &Self::connected),
@@ -407,7 +402,7 @@ protected:
           health::HealthChecker::create(
               task.health_check(),
               launcherDirectory,
-              self(),
+              defer(self(), &Self::taskHealthUpdated, lambda::_1),
               taskId,
               None(),
               vector<string>());
@@ -830,27 +825,25 @@ protected:
     shutdown();
   }
 
-  void taskHealthUpdated(
-      const TaskID& taskId,
-      bool healthy,
-      bool initiateTaskKill)
+  void taskHealthUpdated(const TaskHealthStatus& healthStatus)
   {
     // This prevents us from sending `TASK_RUNNING` after a terminal status
     // update, because we may receive an update from a health check scheduled
     // before the task has been waited on.
-    if (!checkers.contains(taskId)) {
+    if (!checkers.contains(healthStatus.task_id())) {
       return;
     }
 
-    LOG(INFO) << "Received task health update for task '" << taskId
-              << "', task is "
-              << (healthy ? "healthy" : "not healthy");
+    LOG(INFO) << "Received task health update for task"
+              << " '" << healthStatus.task_id() << "', task is "
+              << (healthStatus.healthy() ? "healthy" : "not healthy");
 
-    update(taskId, TASK_RUNNING, None(), healthy);
+    update(
+        healthStatus.task_id(), TASK_RUNNING, None(), healthStatus.healthy());
 
-    if (initiateTaskKill) {
+    if (healthStatus.kill_task()) {
       unhealthy = true;
-      killTask(taskId);
+      killTask(healthStatus.task_id());
     }
   }
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/14b5e94d/src/launcher/executor.cpp
----------------------------------------------------------------------
diff --git a/src/launcher/executor.cpp b/src/launcher/executor.cpp
index 38fe4fb..cd5aa46 100644
--- a/src/launcher/executor.cpp
+++ b/src/launcher/executor.cpp
@@ -259,12 +259,6 @@ public:
 protected:
   virtual void initialize()
   {
-    install<TaskHealthStatus>(
-        &CommandExecutor::taskHealthUpdated,
-        &TaskHealthStatus::task_id,
-        &TaskHealthStatus::healthy,
-        &TaskHealthStatus::kill_task);
-
     Option<string> value = os::getenv("MESOS_HTTP_COMMAND_EXECUTOR");
 
     // We initialize the library here to ensure that callbacks are only invoked
@@ -297,10 +291,7 @@ protected:
     }
   }
 
-  void taskHealthUpdated(
-      const TaskID& _taskId,
-      const bool healthy,
-      const bool initiateTaskKill)
+  void taskHealthUpdated(const TaskHealthStatus& healthStatus)
   {
     // This check prevents us from sending `TASK_RUNNING` updates
     // after the task has been transitioned to `TASK_KILLING`.
@@ -309,13 +300,13 @@ protected:
     }
 
     cout << "Received task health update, healthy: "
-         << stringify(healthy) << endl;
+         << stringify(healthStatus.healthy()) << endl;
 
-    update(_taskId, TASK_RUNNING, healthy);
+    update(healthStatus.task_id(), TASK_RUNNING, healthStatus.healthy());
 
-    if (initiateTaskKill) {
+    if (healthStatus.kill_task()) {
       killedByHealthCheck = true;
-      kill(_taskId);
+      kill(healthStatus.task_id());
     }
   }
 
@@ -454,7 +445,7 @@ protected:
         health::HealthChecker::create(
             task->health_check(),
             launcherDir,
-            self(),
+            defer(self(), &Self::taskHealthUpdated, lambda::_1),
             task->task_id(),
             pid,
             namespaces);


[08/11] mesos git commit: Ensured default executor ignores health updates for terminated tasks.

Posted by al...@apache.org.
Ensured default executor ignores health updates for terminated tasks.

After the task has been terminated, its health updates become
irrelevant and should be ignored. Also if the default executor
shuts down, we can safely stop all health checkers.

Technically health checking should be stopped right before
TASK_KILLING update is sent to avoid subsequent TASK_RUNNING
updates, but the default executor currently does not support
TASK_KILLING.

Review: https://reviews.apache.org/r/52871/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/46e348de
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/46e348de
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/46e348de

Branch: refs/heads/master
Commit: 46e348de89c5cc8998068b8e395cd8f2cb492873
Parents: 2814d1e
Author: Alexander Rukletsov <ru...@gmail.com>
Authored: Fri Nov 25 16:14:19 2016 +0100
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Nov 25 16:14:19 2016 +0100

----------------------------------------------------------------------
 src/launcher/default_executor.cpp | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/46e348de/src/launcher/default_executor.cpp
----------------------------------------------------------------------
diff --git a/src/launcher/default_executor.cpp b/src/launcher/default_executor.cpp
index 0c590c8..99a1f5e 100644
--- a/src/launcher/default_executor.cpp
+++ b/src/launcher/default_executor.cpp
@@ -420,7 +420,7 @@ protected:
           return;
         }
 
-        checkers.push_back(_checker.get());
+        checkers[taskId] = _checker.get();
       }
 
       // Currently, the Mesos agent does not expose the mapping from
@@ -617,6 +617,17 @@ protected:
       deserialize<agent::Response>(contentType, response->body);
     CHECK_SOME(waitResponse);
 
+    // If the task has been health checked, stop the associated checker.
+    //
+    // TODO(alexr): Once we support `TASK_KILLING` in this executor, health
+    // checking should be stopped right before sending the `TASK_KILLING`
+    // update to avoid subsequent `TASK_RUNNING` updates.
+    if (checkers.contains(taskId)) {
+      CHECK_NOTNULL(checkers.at(taskId).get());
+      checkers.at(taskId)->stop();
+      checkers.erase(taskId);
+    }
+
     TaskState taskState;
     Option<string> message;
 
@@ -682,6 +693,12 @@ protected:
 
     shuttingDown = true;
 
+    // Stop health checking all tasks because we are shutting down.
+    foreach (const Owned<health::HealthChecker>& checker, checkers.values()) {
+      checker->stop();
+    }
+    checkers.clear();
+
     if (!launched) {
       __shutdown();
       return;
@@ -818,6 +835,13 @@ protected:
       bool healthy,
       bool initiateTaskKill)
   {
+    // This prevents us from sending `TASK_RUNNING` after a terminal status
+    // update, because we may receive an update from a health check scheduled
+    // before the task has been waited on.
+    if (!checkers.contains(taskId)) {
+      return;
+    }
+
     LOG(INFO) << "Received task health update for task '" << taskId
               << "', task is "
               << (healthy ? "healthy" : "not healthy");
@@ -1009,7 +1033,7 @@ private:
   // a `connected()` callback.
   Option<UUID> connectionId;
 
-  list<Owned<health::HealthChecker>> checkers; // Health checkers.
+  hashmap<TaskID, Owned<health::HealthChecker>> checkers; // Health checkers.
 };
 
 } // namespace internal {


[04/11] mesos git commit: Used `Duration::create()` for double -> Duration conversion.

Posted by al...@apache.org.
Used `Duration::create()` for double -> Duration conversion.

Additionally persist health check parameters from the `HealthCheck`
protobuf as class members to avoid code duplication.

Review: https://reviews.apache.org/r/52867/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/3da73971
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/3da73971
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/3da73971

Branch: refs/heads/master
Commit: 3da73971955bce20a93d2a12866ab3dd8f2d474e
Parents: bbe3b64
Author: Alexander Rukletsov <ru...@gmail.com>
Authored: Fri Nov 25 16:13:45 2016 +0100
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Nov 25 16:13:45 2016 +0100

----------------------------------------------------------------------
 src/health-check/health_checker.cpp | 62 ++++++++++++++++++++------------
 src/health-check/health_checker.hpp |  5 +++
 2 files changed, 45 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/3da73971/src/health-check/health_checker.cpp
----------------------------------------------------------------------
diff --git a/src/health-check/health_checker.cpp b/src/health-check/health_checker.cpp
index c118ca5..e66c9df 100644
--- a/src/health-check/health_checker.cpp
+++ b/src/health-check/health_checker.cpp
@@ -175,6 +175,22 @@ HealthCheckerProcess::HealthCheckerProcess(
     namespaces(_namespaces),
     consecutiveFailures(0)
 {
+  Try<Duration> create = Duration::create(check.delay_seconds());
+  CHECK_SOME(create);
+  checkDelay = create.get();
+
+  create = Duration::create(check.interval_seconds());
+  CHECK_SOME(create);
+  checkInterval = create.get();
+
+  create = Duration::create(check.grace_period_seconds());
+  CHECK_SOME(create);
+  checkGracePeriod = create.get();
+
+  create = Duration::create(check.timeout_seconds());
+  CHECK_SOME(create);
+  checkTimeout = create.get();
+
 #ifdef __linux__
   if (!namespaces.empty()) {
     clone = lambda::bind(&cloneWithSetns, lambda::_1, taskPid, namespaces);
@@ -185,24 +201,22 @@ HealthCheckerProcess::HealthCheckerProcess(
 
 void HealthCheckerProcess::initialize()
 {
-  VLOG(1) << "Health check starts in "
-          << Seconds(static_cast<int64_t>(check.delay_seconds()))
-          << ", grace period "
-          << Seconds(static_cast<int64_t>(check.grace_period_seconds()));
+  VLOG(1) << "Health check starts in " << checkDelay
+          << ", grace period " << checkGracePeriod;
 
   startTime = Clock::now();
 
-  scheduleNext(Seconds(static_cast<int64_t>(check.delay_seconds())));
+  scheduleNext(checkDelay);
 }
 
 
 void HealthCheckerProcess::failure(const string& message)
 {
   if (initializing &&
-      check.grace_period_seconds() > 0 &&
-      (Clock::now() - startTime).secs() <= check.grace_period_seconds()) {
+      checkGracePeriod.secs() > 0 &&
+      (Clock::now() - startTime) <= checkGracePeriod) {
     LOG(INFO) << "Ignoring failure as health check still in grace period";
-    scheduleNext(Seconds(static_cast<int64_t>(check.interval_seconds())));
+    scheduleNext(checkInterval);
     return;
   }
 
@@ -226,7 +240,7 @@ void HealthCheckerProcess::failure(const string& message)
   // Even if we set the `kill_task` flag, it is an executor who kills the task
   // and honors the flag (or not). We have no control over the task's lifetime,
   // hence we should continue until we are explicitly asked to stop.
-  scheduleNext(Seconds(static_cast<int64_t>(check.interval_seconds())));
+  scheduleNext(checkInterval);
 }
 
 
@@ -245,7 +259,7 @@ void HealthCheckerProcess::success()
   }
 
   consecutiveFailures = 0;
-  scheduleNext(Seconds(static_cast<int64_t>(check.interval_seconds())));
+  scheduleNext(checkInterval);
 }
 
 
@@ -347,10 +361,12 @@ Future<Nothing> HealthCheckerProcess::commandHealthCheck()
   }
 
   pid_t commandPid = external->pid();
-  Duration timeout = Seconds(static_cast<int64_t>(check.timeout_seconds()));
+  const Duration timeout = checkTimeout;
 
   return external->status()
-    .after(timeout, [timeout, commandPid](Future<Option<int>> future) {
+    .after(
+        timeout,
+        [timeout, commandPid](Future<Option<int>> future) {
       future.discard();
 
       if (commandPid != -1) {
@@ -421,16 +437,17 @@ Future<Nothing> HealthCheckerProcess::httpHealthCheck()
   }
 
   pid_t curlPid = s->pid();
-  Duration timeout = Seconds(static_cast<int64_t>(check.timeout_seconds()));
+  const Duration timeout = checkTimeout;
 
   return await(
       s->status(),
       process::io::read(s->out().get()),
       process::io::read(s->err().get()))
-    .after(timeout,
-      [timeout, curlPid](Future<tuple<Future<Option<int>>,
-                                      Future<string>,
-                                      Future<string>>> future) {
+    .after(
+        timeout,
+        [timeout, curlPid](Future<tuple<Future<Option<int>>,
+                                        Future<string>,
+                                        Future<string>>> future) {
       future.discard();
 
       if (curlPid != -1) {
@@ -544,16 +561,17 @@ Future<Nothing> HealthCheckerProcess::tcpHealthCheck()
   }
 
   pid_t tcpConnectPid = s->pid();
-  Duration timeout = Seconds(static_cast<int64_t>(check.timeout_seconds()));
+  const Duration timeout = checkTimeout;
 
   return await(
       s->status(),
       process::io::read(s->out().get()),
       process::io::read(s->err().get()))
-    .after(timeout,
-      [timeout, tcpConnectPid](Future<tuple<Future<Option<int>>,
-                                            Future<string>,
-                                            Future<string>>> future) {
+    .after(
+        timeout,
+        [timeout, tcpConnectPid](Future<tuple<Future<Option<int>>,
+                                              Future<string>,
+                                              Future<string>>> future) {
       future.discard();
 
       if (tcpConnectPid != -1) {

http://git-wip-us.apache.org/repos/asf/mesos/blob/3da73971/src/health-check/health_checker.hpp
----------------------------------------------------------------------
diff --git a/src/health-check/health_checker.hpp b/src/health-check/health_checker.hpp
index 31ed744..733d2bd 100644
--- a/src/health-check/health_checker.hpp
+++ b/src/health-check/health_checker.hpp
@@ -121,6 +121,11 @@ private:
   void scheduleNext(const Duration& duration);
 
   HealthCheck check;
+  Duration checkDelay;
+  Duration checkInterval;
+  Duration checkGracePeriod;
+  Duration checkTimeout;
+
   std::string launcherDir;
   bool initializing;
   process::UPID executor;


[06/11] mesos git commit: Ensured command executor stops health checking terminated tasks.

Posted by al...@apache.org.
Ensured command executor stops health checking terminated tasks.

We stop health checking both when the task is reaped and
killed since these may be two different execution paths.

Review: https://reviews.apache.org/r/52869/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/2edfcd34
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/2edfcd34
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/2edfcd34

Branch: refs/heads/master
Commit: 2edfcd34493dbac81c57513fd50197eb10834389
Parents: bd6186d
Author: Alexander Rukletsov <ru...@gmail.com>
Authored: Fri Nov 25 16:14:05 2016 +0100
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Nov 25 16:14:05 2016 +0100

----------------------------------------------------------------------
 src/launcher/executor.cpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/2edfcd34/src/launcher/executor.cpp
----------------------------------------------------------------------
diff --git a/src/launcher/executor.cpp b/src/launcher/executor.cpp
index ec1f7a0..38fe4fb 100644
--- a/src/launcher/executor.cpp
+++ b/src/launcher/executor.cpp
@@ -304,7 +304,7 @@ protected:
   {
     // This check prevents us from sending `TASK_RUNNING` updates
     // after the task has been transitioned to `TASK_KILLING`.
-    if (killed) {
+    if (killed || terminated) {
       return;
     }
 
@@ -594,6 +594,11 @@ private:
         update(taskId.get(), TASK_KILLING);
       }
 
+      // Stop health checking the task.
+      if (checker.get() != nullptr) {
+        checker->stop();
+      }
+
       // Now perform signal escalation to begin killing the task.
       CHECK_GT(pid, 0);
 
@@ -629,6 +634,11 @@ private:
   {
     terminated = true;
 
+    // Stop health checking the task.
+    if (checker.get() != nullptr) {
+      checker->stop();
+    }
+
     TaskState taskState;
     string message;
 


[11/11] mesos git commit: Printed complete health check configuration on task launch.

Posted by al...@apache.org.
Printed complete health check configuration on task launch.

Review: https://reviews.apache.org/r/54007/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/08da1ee8
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/08da1ee8
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/08da1ee8

Branch: refs/heads/master
Commit: 08da1ee8a45f4241d49a555c3b1adcc24ec1cc68
Parents: 906753b
Author: Alexander Rukletsov <ru...@gmail.com>
Authored: Fri Nov 25 16:14:43 2016 +0100
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Nov 25 16:14:43 2016 +0100

----------------------------------------------------------------------
 src/health-check/health_checker.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/08da1ee8/src/health-check/health_checker.cpp
----------------------------------------------------------------------
diff --git a/src/health-check/health_checker.cpp b/src/health-check/health_checker.cpp
index 544c1d2..c0d150a 100644
--- a/src/health-check/health_checker.cpp
+++ b/src/health-check/health_checker.cpp
@@ -37,6 +37,8 @@
 #include <process/subprocess.hpp>
 
 #include <stout/duration.hpp>
+#include <stout/json.hpp>
+#include <stout/jsonify.hpp>
 #include <stout/option.hpp>
 #include <stout/os.hpp>
 #include <stout/path.hpp>
@@ -208,8 +210,8 @@ HealthCheckerProcess::HealthCheckerProcess(
 
 void HealthCheckerProcess::initialize()
 {
-  VLOG(1) << "Health check starts in " << checkDelay
-          << ", grace period " << checkGracePeriod;
+  VLOG(1) << "Health check configuration:"
+          << " '" << jsonify(JSON::Protobuf(check)) << "'";
 
   startTime = Clock::now();
 


[10/11] mesos git commit: Cleaned up private members in HealthChecker class.

Posted by al...@apache.org.
Cleaned up private members in HealthChecker class.

Review: https://reviews.apache.org/r/52873/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/906753b0
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/906753b0
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/906753b0

Branch: refs/heads/master
Commit: 906753b0ecc2afb156e4006b147f08ec7de3695b
Parents: 14b5e94
Author: Alexander Rukletsov <ru...@gmail.com>
Authored: Fri Nov 25 16:14:35 2016 +0100
Committer: Alexander Rukletsov <al...@apache.org>
Committed: Fri Nov 25 16:14:35 2016 +0100

----------------------------------------------------------------------
 src/health-check/health_checker.cpp |  6 +++---
 src/health-check/health_checker.hpp | 15 +++++++++------
 2 files changed, 12 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/906753b0/src/health-check/health_checker.cpp
----------------------------------------------------------------------
diff --git a/src/health-check/health_checker.cpp b/src/health-check/health_checker.cpp
index 7cdf5df..544c1d2 100644
--- a/src/health-check/health_checker.cpp
+++ b/src/health-check/health_checker.cpp
@@ -174,13 +174,13 @@ HealthCheckerProcess::HealthCheckerProcess(
     const vector<string>& _namespaces)
   : ProcessBase(process::ID::generate("health-checker")),
     check(_check),
-    healthUpdateCallback(_callback),
     launcherDir(_launcherDir),
-    initializing(true),
+    healthUpdateCallback(_callback),
     taskID(_taskID),
     taskPid(_taskPid),
     namespaces(_namespaces),
-    consecutiveFailures(0)
+    consecutiveFailures(0),
+    initializing(true)
 {
   Try<Duration> create = Duration::create(check.delay_seconds());
   CHECK_SOME(create);

http://git-wip-us.apache.org/repos/asf/mesos/blob/906753b0/src/health-check/health_checker.hpp
----------------------------------------------------------------------
diff --git a/src/health-check/health_checker.hpp b/src/health-check/health_checker.hpp
index 8d410a8..1230f96 100644
--- a/src/health-check/health_checker.hpp
+++ b/src/health-check/health_checker.hpp
@@ -135,15 +135,18 @@ private:
   Duration checkGracePeriod;
   Duration checkTimeout;
 
-  lambda::function<void(const TaskHealthStatus&)> healthUpdateCallback;
-  std::string launcherDir;
-  bool initializing;
-  TaskID taskID;
-  Option<pid_t> taskPid;
-  std::vector<std::string> namespaces;
+  // Contains a binary for TCP health checks.
+  const std::string launcherDir;
+
+  const lambda::function<void(const TaskHealthStatus&)> healthUpdateCallback;
+  const TaskID taskID;
+  const Option<pid_t> taskPid;
+  const std::vector<std::string> namespaces;
   Option<lambda::function<pid_t(const lambda::function<int()>&)>> clone;
+
   uint32_t consecutiveFailures;
   process::Time startTime;
+  bool initializing;
 };