You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by gi...@apache.org on 2018/03/07 09:14:55 UTC

[01/12] mesos git commit: Updated discard handling for Docker 'stop' and 'pull' commands.

Repository: mesos
Updated Branches:
  refs/heads/1.4.x 2b97ba53b -> 84c97007c


Updated discard handling for Docker 'stop' and 'pull' commands.

Review: https://reviews.apache.org/r/65787/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/12d12e42
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/12d12e42
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/12d12e42

Branch: refs/heads/1.4.x
Commit: 12d12e4275f7b95917ac6f433b7e6a4f95b34858
Parents: e26355f
Author: Greg Mann <gr...@mesosphere.io>
Authored: Fri Feb 23 16:41:46 2018 -0800
Committer: Gilbert Song <so...@gmail.com>
Committed: Fri Mar 2 16:55:50 2018 -0800

----------------------------------------------------------------------
 src/docker/docker.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/12d12e42/src/docker/docker.cpp
----------------------------------------------------------------------
diff --git a/src/docker/docker.cpp b/src/docker/docker.cpp
index f0dbd8d..9ab716f 100755
--- a/src/docker/docker.cpp
+++ b/src/docker/docker.cpp
@@ -1143,7 +1143,8 @@ Future<Nothing> Docker::stop(
         containerName,
         cmd,
         s.get(),
-        remove));
+        remove))
+    .onDiscard(lambda::bind(&commandDiscarded, s.get(), cmd));
 }
 
 
@@ -1561,7 +1562,8 @@ Future<Docker::Image> Docker::pull(
         path,
         socket,
         config,
-        output));
+        output))
+    .onDiscard(lambda::bind(&commandDiscarded, s.get(), cmd));
 }
 
 


[08/12] mesos git commit: Fixed compile issue for backporting by removing 'AwaitSingleAbandon'.

Posted by gi...@apache.org.
Fixed compile issue for backporting by removing 'AwaitSingleAbandon'.


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/90184094
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/90184094
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/90184094

Branch: refs/heads/1.4.x
Commit: 90184094a61254b811246ef877203a91f0895942
Parents: 15c194b
Author: Gilbert Song <so...@gmail.com>
Authored: Mon Mar 5 18:09:40 2018 -0800
Committer: Gilbert Song <so...@gmail.com>
Committed: Mon Mar 5 18:11:12 2018 -0800

----------------------------------------------------------------------
 3rdparty/libprocess/src/tests/collect_tests.cpp | 35 --------------------
 1 file changed, 35 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/90184094/3rdparty/libprocess/src/tests/collect_tests.cpp
----------------------------------------------------------------------
diff --git a/3rdparty/libprocess/src/tests/collect_tests.cpp b/3rdparty/libprocess/src/tests/collect_tests.cpp
index 0dd3771..cd6f259 100644
--- a/3rdparty/libprocess/src/tests/collect_tests.cpp
+++ b/3rdparty/libprocess/src/tests/collect_tests.cpp
@@ -274,38 +274,3 @@ TEST(AwaitTest, AwaitSingleDiscard)
 
   EXPECT_TRUE(promise.future().hasDiscard());
 }
-
-
-TEST(AwaitTest, AwaitSingleAbandon)
-{
-  Owned<Promise<int>> promise(new Promise<int>());
-
-  auto bar = [&]() {
-    return promise->future();
-  };
-
-  auto foo = [&]() {
-    return await(bar())
-      .then([](const Future<int>& f) {
-        return f
-          .then([](int i) {
-            return stringify(i);
-          });
-      });
-  };
-
-  // There is a race from the time that we reset the promise to when
-  // the await process is terminated so we need to use Future::recover
-  // to properly handle this case.
-  Future<string> future = foo()
-    .recover([](const Future<string>& f) -> Future<string> {
-      if (f.isAbandoned()) {
-        return "hello";
-      }
-      return f;
-    });
-
-  promise.reset();
-
-  AWAIT_EQ("hello", future);
-}


[09/12] mesos git commit: Handled hanging docker `stop`, `inspect` commands in docker executor.

Posted by gi...@apache.org.
Handled hanging docker `stop`, `inspect` commands in docker executor.

Previosly, if `docker inspect` command hanged, the docker container
ended up in an unkillable state. This patch adds a timeout for inspect
command after receiving `killTask` analogically to `reaped` handler.
In addition we've added a timeout for `docker stop` command. If docker
`stop` or `inspect` command times out, we discard the related future,
thus the docker library kills previously spawned docker cli subprocess.
As a result, a scheduler can retry `killTask` operation to handle
nasty docker bugs that lead to hanging docker cli.

Review: https://reviews.apache.org/r/65713/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/29aa78ee
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/29aa78ee
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/29aa78ee

Branch: refs/heads/1.4.x
Commit: 29aa78ee05627768697b24bcf78f2a4f028857f9
Parents: d841ba4
Author: Andrei Budnik <ab...@mesosphere.com>
Authored: Fri Mar 2 15:38:59 2018 -0800
Committer: Gilbert Song <so...@gmail.com>
Committed: Mon Mar 5 18:11:12 2018 -0800

----------------------------------------------------------------------
 src/docker/executor.cpp | 68 ++++++++++++++++++++++++++++++--------------
 1 file changed, 47 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/29aa78ee/src/docker/executor.cpp
----------------------------------------------------------------------
diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index e580a48..5df8707 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -96,7 +96,6 @@ public:
       killed(false),
       terminated(false),
       killedByHealthCheck(false),
-      killingInProgress(false),
       launcherDir(launcherDir),
       docker(docker),
       containerName(containerName),
@@ -298,6 +297,13 @@ public:
         return Nothing();
       }));
 
+    inspect
+      .after(DOCKER_INSPECT_TIMEOUT, [=](const Future<Nothing>&) {
+        LOG(WARNING) << "Docker inspect has not finished after "
+                     << DOCKER_INSPECT_TIMEOUT;
+        return inspect;
+      });
+
     inspect.onFailed(defer(self(), [=](const string& failure) {
       LOG(ERROR) << "Failed to inspect container '" << containerName << "'"
                  << ": " << failure;
@@ -417,9 +423,8 @@ private:
     // TODO(alexr): If a kill is in progress, consider adjusting
     // the grace period if a new one is provided.
 
-    // Issue the kill signal if the container is running
-    // and kill attempt is not in progress.
-    if (run.isSome() && !killingInProgress) {
+    // Issue the kill signal if there was an attempt to launch the container.
+    if (run.isSome()) {
       // We have to issue the kill after 'docker inspect' has
       // completed, otherwise we may race with 'docker run'
       // and docker may not know about the container. Note
@@ -428,6 +433,15 @@ private:
       // issued the kill.
       inspect
         .onAny(defer(self(), &Self::_killTask, _taskId, gracePeriod));
+
+      // If the inspect takes too long we discard it to ensure we
+      // don't wait forever, however in this case there may be no
+      // TASK_RUNNING update.
+      inspect
+        .after(DOCKER_INSPECT_TIMEOUT, [=](const Future<Nothing>&) {
+          inspect.discard();
+          return inspect;
+        });
     }
   }
 
@@ -438,9 +452,7 @@ private:
     CHECK_SOME(taskId);
     CHECK_EQ(taskId_, taskId.get());
 
-    if (!terminated && !killingInProgress) {
-      killingInProgress = true;
-
+    if (!terminated) {
       // Once the task has been transitioned to `killed`,
       // there is no way back, even if the kill attempt
       // failed. This also allows us to send TASK_KILLING
@@ -475,28 +487,43 @@ private:
         }
       }
 
+      // If a previous attempt to stop a Docker container is still in progress,
+      // we need to kill the hanging Docker CLI subprocess. Discarding this
+      // future triggers a callback in the Docker library that kills the
+      // subprocess.
+      if (stop.isPending()) {
+        LOG(WARNING) << "Previous docker stop has not terminated yet"
+                     << " for container '" << containerName << "'";
+        stop.discard();
+      }
+
       // TODO(bmahler): Replace this with 'docker kill' so
       // that we can adjust the grace period in the case of
       // a `KillPolicy` override.
+      //
+      // NOTE: `docker stop` may or may not finish. Our behaviour is to give
+      // the subprocess a chance to finish until next time `_killtask` is
+      // invoked. Also, invoking `docker stop` might be unsuccessful, in which
+      // case the container most probably does not receive the signal.
+      // In any case we should allow schedulers to retry the kill operation or,
+      // if the kill was initiated by a failing health check, retry ourselves.
+      // We do not bail out nor stop retrying to avoid sending a terminal
+      // status update while the container might still be running.
       stop = docker->stop(containerName, gracePeriod);
 
-      // Invoking `docker stop` might be unsuccessful, in which case the
-      // container most probably does not receive the signal. In this case we
-      // should allow schedulers to retry the kill operation or, if the kill
-      // was initiated by a failing health check, retry ourselves. We do not
-      // bail out nor stop retrying to avoid sending a terminal status update
-      // while the container might still be running.
-      //
-      // NOTE: `docker stop` might also hang. We do not address this for now,
-      // because there is no evidence that in this case docker daemon might
-      // function properly, i.e., it's only the docker cli command that hangs,
-      // and hence there is not so much we can do. See MESOS-6743.
+      if (killedByHealthCheck) {
+        stop
+          .after(KILL_RETRY_INTERVAL, defer(self(), [=](Future<Nothing>) {
+            LOG(INFO) << "Retrying to kill task";
+            _killTask(taskId_, gracePeriod);
+            return stop;
+          }));
+      }
+
       stop.onFailed(defer(self(), [=](const string& failure) {
         LOG(ERROR) << "Failed to stop container '" << containerName << "'"
                    << ": " << failure;
 
-        killingInProgress = false;
-
         if (killedByHealthCheck) {
           LOG(INFO) << "Retrying to kill task in " << KILL_RETRY_INTERVAL;
           delay(
@@ -717,7 +744,6 @@ private:
   bool terminated;
 
   bool killedByHealthCheck;
-  bool killingInProgress; // Guard against simultaneous kill attempts.
 
   string launcherDir;
   Owned<Docker> docker;


[10/12] mesos git commit: Added overload of process::await that takes and returns single future.

Posted by gi...@apache.org.
Added overload of process::await that takes and returns single future.

Review: https://reviews.apache.org/r/61152


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/15c194b8
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/15c194b8
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/15c194b8

Branch: refs/heads/1.4.x
Commit: 15c194b8ff42b7cb4b1466e59a43e69ea1307098
Parents: 29aa78e
Author: Benjamin Hindman <be...@gmail.com>
Authored: Wed Dec 28 06:42:06 2016 -0800
Committer: Gilbert Song <so...@gmail.com>
Committed: Mon Mar 5 18:11:12 2018 -0800

----------------------------------------------------------------------
 3rdparty/libprocess/include/process/collect.hpp | 56 ++++++++++++++++
 3rdparty/libprocess/src/tests/collect_tests.cpp | 67 ++++++++++++++++++++
 2 files changed, 123 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/15c194b8/3rdparty/libprocess/include/process/collect.hpp
----------------------------------------------------------------------
diff --git a/3rdparty/libprocess/include/process/collect.hpp b/3rdparty/libprocess/include/process/collect.hpp
index fccf55a..43f55ca 100644
--- a/3rdparty/libprocess/include/process/collect.hpp
+++ b/3rdparty/libprocess/include/process/collect.hpp
@@ -57,6 +57,62 @@ template <typename... Ts>
 Future<std::tuple<Future<Ts>...>> await(const Future<Ts>&... futures);
 
 
+// Waits on the future specified and returns after the future has been
+// completed or the await has been discarded. This is useful when
+// wanting to "break out" of a future chain if a discard occurs but
+// the underlying future has not been discarded. For example:
+//
+//   Future<string> foo()
+//   {
+//     return bar()
+//       .then([](int i) {
+//         return stringify(i);
+//       });
+//   }
+//
+//   Future<stringify> future = foo();
+//   future.discard();
+//
+// In the above code we'll propagate the discard to `bar()` but might
+// wait forever if `bar()` can't discard their computation. In some
+// circumstances you might want to break out early and you can do that
+// by using `await`, because if we discard an `await` that function
+// will return even though all of the future's it is waiting on have
+// not been discarded (in other words, the `await` can be properly
+// discarded even if the underlying futures have not been discarded).
+//
+//   Future<string> foo()
+//   {
+//     return await(bar())
+//       .recover([](const Future<Future<string>>& future) {
+//         if (future.isDiscarded()) {
+//           cleanup();
+//         }
+//         return Failure("Discarded waiting");
+//       })
+//       .then([](const Future<int>& future) {
+//         return future
+//           .then([](int i) {
+//             return stringify(i);
+//           });
+//       });
+//   }
+//
+//   Future<string> future = foo();
+//   future.discard();
+//
+// Using `await` will enable you to continue execution even if `bar()`
+// does not (or can not) discard their execution.
+template <typename T>
+Future<Future<T>> await(const Future<T>& future)
+{
+  return await(std::list<Future<T>>{future})
+    .then([=]() {
+      return Future<Future<T>>(future);
+    });
+}
+
+
 namespace internal {
 
 template <typename T>

http://git-wip-us.apache.org/repos/asf/mesos/blob/15c194b8/3rdparty/libprocess/src/tests/collect_tests.cpp
----------------------------------------------------------------------
diff --git a/3rdparty/libprocess/src/tests/collect_tests.cpp b/3rdparty/libprocess/src/tests/collect_tests.cpp
index 155e0bb..0dd3771 100644
--- a/3rdparty/libprocess/src/tests/collect_tests.cpp
+++ b/3rdparty/libprocess/src/tests/collect_tests.cpp
@@ -10,15 +10,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License
 
+#include <string>
+
 #include <process/collect.hpp>
 #include <process/gtest.hpp>
 
 #include <stout/gtest.hpp>
+#include <stout/stringify.hpp>
 
 using process::Future;
 using process::Promise;
 
 using std::list;
+using std::string;
 
 TEST(CollectTest, Ready)
 {
@@ -242,3 +246,66 @@ TEST(AwaitTest, DiscardPropagation)
   AWAIT_DISCARDED(future1);
   AWAIT_DISCARDED(future2);
 }
+
+
+TEST(AwaitTest, AwaitSingleDiscard)
+{
+  Promise<int> promise;
+
+  auto bar = [&]() {
+    return promise.future();
+  };
+
+  auto foo = [&]() {
+    return await(bar())
+      .then([](const Future<int>& f) {
+        return f
+          .then([](int i) {
+            return stringify(i);
+          });
+      });
+  };
+
+  Future<string> future = foo();
+
+  future.discard();
+
+  AWAIT_DISCARDED(future);
+
+  EXPECT_TRUE(promise.future().hasDiscard());
+}
+
+
+TEST(AwaitTest, AwaitSingleAbandon)
+{
+  Owned<Promise<int>> promise(new Promise<int>());
+
+  auto bar = [&]() {
+    return promise->future();
+  };
+
+  auto foo = [&]() {
+    return await(bar())
+      .then([](const Future<int>& f) {
+        return f
+          .then([](int i) {
+            return stringify(i);
+          });
+      });
+  };
+
+  // There is a race from the time that we reset the promise to when
+  // the await process is terminated so we need to use Future::recover
+  // to properly handle this case.
+  Future<string> future = foo()
+    .recover([](const Future<string>& f) -> Future<string> {
+      if (f.isAbandoned()) {
+        return "hello";
+      }
+      return f;
+    });
+
+  promise.reset();
+
+  AWAIT_EQ("hello", future);
+}


[02/12] mesos git commit: Reaped the container process directly in Docker executor.

Posted by gi...@apache.org.
Reaped the container process directly in Docker executor.

Due to a Docker issue (https://github.com/moby/moby/issues/33820),
Docker daemon can fail to catch a container exit, i.e., the container
process has already exited but the command `docker ps` shows the
container still running, this will lead to the "docker run" command
that we execute in Docker executor never returning, and it will also
cause the `docker stop` command takes no effect, i.e., it will return
without error but `docker ps` shows the container still running, so
the task will stuck in `TASK_KILLING` state.

To workaround this Docker issue, in this patch we made Docker executor
reaps the container process directly so Docker executor will be notified
once the container process exits.

Review: https://reviews.apache.org/r/65518


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/4faa7f48
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/4faa7f48
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/4faa7f48

Branch: refs/heads/1.4.x
Commit: 4faa7f485e56646de9236062919c24ba1af0ea89
Parents: 2b97ba5
Author: Qian Zhang <zh...@gmail.com>
Authored: Mon Feb 5 20:42:07 2018 +0800
Committer: Gilbert Song <so...@gmail.com>
Committed: Fri Mar 2 16:55:50 2018 -0800

----------------------------------------------------------------------
 src/docker/executor.cpp | 60 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/4faa7f48/src/docker/executor.cpp
----------------------------------------------------------------------
diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index 7a50e66..2c538ce 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -246,6 +246,47 @@ public:
           driver->sendStatusUpdate(status);
         }
 
+        // This is a workaround for the Docker issue below:
+        // https://github.com/moby/moby/issues/33820
+        // Due to this issue, Docker daemon can fail to catch a container exit,
+        // which will lead to the `docker run` command that we execute in this
+        // executor never returning although the container has already exited.
+        // To workaround this issue, here we reap the container process directly
+        // so we will be notified when the container exits.
+        if (container.pid.isSome()) {
+          process::reap(container.pid.get())
+            .then(defer(self(), [=](const Option<int>& status) {
+              // We cannot get the actual exit status of the container
+              // process since it is not a child process of this executor,
+              // so here `status` must be `None`.
+              CHECK_NONE(status);
+
+              // There will be a race between the method `reaped` and this
+              // lambda; ideally when the Docker issue mentioned above does
+              // not occur, `reaped` will be invoked (i.e., the `docker run`
+              // command returns) to get the actual exit status of the
+              // container, so here we wait a few seconds for `reaped` to be
+              // invoked. If `reaped` is not invoked within the timeout, that
+              // means we hit that Docker issue.
+              delay(
+                  Seconds(3),
+                  self(),
+                  &Self::reapedContainer,
+                  container.pid.get());
+
+              return Nothing();
+            }));
+        } else {
+          // This is the case that the container process has already exited,
+          // Similar to the above case, let's wait a few seconds for `reaped`
+          // to be invoked.
+          delay(
+              Seconds(3),
+              self(),
+              &Self::reapedContainer,
+              None());
+        }
+
         return Nothing();
       }));
 
@@ -461,8 +502,27 @@ private:
     }
   }
 
+  void reapedContainer(Option<pid_t> pid)
+  {
+    // Do nothing if the method `reaped` has already been invoked.
+    if (terminated) {
+      return;
+    }
+
+    // This means the Docker issue mentioned in `launchTask` occurs.
+    LOG(WARNING) << "The container process"
+                 << (pid.isSome() ? " (pid: " + stringify(pid.get()) + ")" : "")
+                 << " has exited, but Docker daemon failed to catch it.";
+
+    reaped(None());
+  }
+
   void reaped(const Future<Option<int>>& run)
   {
+    if (terminated) {
+      return;
+    }
+
     terminated = true;
 
     // Stop health checking the task.


[11/12] mesos git commit: Added inspect retries to the Docker executor.

Posted by gi...@apache.org.
Added inspect retries to the Docker executor.

This patch adds retries for `inspect` command to workaround docker
daemon hangs. We assume that the docker daemon can be temporarily
unresponsive. If it's unresponsive, then any started docker cli
command hangs. To address the issue, we retry `inspect` in the loop.

Review: https://reviews.apache.org/r/65759/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/513c8dd2
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/513c8dd2
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/513c8dd2

Branch: refs/heads/1.4.x
Commit: 513c8dd2c18911ec1090a67193faf1d28a1b2a1f
Parents: 9018409
Author: Andrei Budnik <ab...@mesosphere.com>
Authored: Fri Mar 2 15:39:05 2018 -0800
Committer: Gilbert Song <so...@gmail.com>
Committed: Mon Mar 5 18:11:12 2018 -0800

----------------------------------------------------------------------
 src/docker/executor.cpp | 46 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 37 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/513c8dd2/src/docker/executor.cpp
----------------------------------------------------------------------
diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index 5df8707..4b5f257 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -22,8 +22,10 @@
 #include <mesos/executor.hpp>
 #include <mesos/mesos.hpp>
 
+#include <process/collect.hpp>
 #include <process/delay.hpp>
 #include <process/id.hpp>
+#include <process/loop.hpp>
 #include <process/owned.hpp>
 #include <process/process.hpp>
 #include <process/protobuf.hpp>
@@ -204,13 +206,46 @@ public:
 
     run->onAny(defer(self(), &Self::reaped, lambda::_1));
 
+    // Since the Docker daemon might hang, we have to retry the inspect command.
+    auto inspectLoop = loop(
+        self(),
+        [=]() {
+          return await(
+              docker->inspect(containerName, DOCKER_INSPECT_DELAY)
+                .after(
+                    DOCKER_INSPECT_TIMEOUT,
+                    [=](Future<Docker::Container> future) {
+                      LOG(WARNING) << "Docker inspect timed out after "
+                                   << DOCKER_INSPECT_TIMEOUT
+                                   << " for container "
+                                   << "'" << containerName << "'";
+
+                      // We need to clean up the hanging Docker CLI process.
+                      // Discarding the inspect future triggers a callback in
+                      // the Docker library that kills the subprocess and
+                      // transitions the future.
+                      future.discard();
+                      return future;
+                    }));
+        },
+        [](const Future<Docker::Container>& future)
+            -> Future<ControlFlow<Docker::Container>> {
+          if (future.isReady()) {
+            return Break(future.get());
+          }
+          if (future.isFailed()) {
+            return Failure(future.failure());
+          }
+          return Continue();
+        });
+
     // Delay sending TASK_RUNNING status update until we receive
     // inspect output. Note that we store a future that completes
     // after the sending of the running update. This allows us to
     // ensure that the terminal update is sent after the running
     // update (see `reaped()`).
-    inspect = docker->inspect(containerName, DOCKER_INSPECT_DELAY)
-      .then(defer(self(), [=](const Docker::Container& container) {
+    inspect =
+      inspectLoop.then(defer(self(), [=](const Docker::Container& container) {
         if (!killed) {
           containerPid = container.pid;
 
@@ -297,13 +332,6 @@ public:
         return Nothing();
       }));
 
-    inspect
-      .after(DOCKER_INSPECT_TIMEOUT, [=](const Future<Nothing>&) {
-        LOG(WARNING) << "Docker inspect has not finished after "
-                     << DOCKER_INSPECT_TIMEOUT;
-        return inspect;
-      });
-
     inspect.onFailed(defer(self(), [=](const string& failure) {
       LOG(ERROR) << "Failed to inspect container '" << containerName << "'"
                  << ": " << failure;


[07/12] mesos git commit: Ensured that Docker containerizer returns a failed Future in one case.

Posted by gi...@apache.org.
Ensured that Docker containerizer returns a failed Future in one case.

Because the Docker library did not immediately transition the
Future returned by `inspect()` to DISCARDED, it was safe for the
Docker containerizer to discard this Future before failing the
Promise associated with the Future returned by `launch()`.

However, the introduction of an `onDiscard` callback in
`Docker::inspect()` makes this assumption invalid. This patch
addresses this by failing the Promise before discarding the
Future.

Review: https://reviews.apache.org/r/65743/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/57cd3c34
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/57cd3c34
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/57cd3c34

Branch: refs/heads/1.4.x
Commit: 57cd3c3467d3c6923fcc22dd3b491da3f5dd8ad8
Parents: 12d12e4
Author: Greg Mann <gr...@mesosphere.io>
Authored: Tue Feb 27 16:42:10 2018 -0800
Committer: Gilbert Song <so...@gmail.com>
Committed: Mon Mar 5 18:11:12 2018 -0800

----------------------------------------------------------------------
 src/slave/containerizer/docker.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/57cd3c34/src/slave/containerizer/docker.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/docker.cpp b/src/slave/containerizer/docker.cpp
index 4db76df..e261a42 100644
--- a/src/slave/containerizer/docker.cpp
+++ b/src/slave/containerizer/docker.cpp
@@ -1427,15 +1427,15 @@ Future<Docker::Container> DockerContainerizerProcess::launchExecutorContainer(
 
     run.onAny([=]() mutable {
       if (!run.isReady()) {
-        inspect.discard();
         promise->fail(run.isFailed() ? run.failure() : "discarded");
-      } else if (run->isNone()) {
         inspect.discard();
+      } else if (run->isNone()) {
         promise->fail("Failed to obtain exit status of container");
+        inspect.discard();
       } else {
         if (!WSUCCEEDED(run->get())) {
-          inspect.discard();
           promise->fail("Container " + WSTRINGIFY(run->get()));
+          inspect.discard();
         }
 
         // TODO(bmahler): Handle the case where the 'run' exits


[05/12] mesos git commit: Updated discard handling in `Docker::inspect()`.

Posted by gi...@apache.org.
Updated discard handling in `Docker::inspect()`.

Previously, discards of the `Future` returned by `Docker::inspect()`
were only handled at the beginning of each asynchronous continuation
in the library function's call chain. This meant that if a Docker
CLI command became stuck in between async calls, discarding the
`Future` would have no effect.

This patch adds an `onDiscard` callback to the `Future` to ensure
that any discards have the desired effect: cleanup of any spawned
subprocess, and a transition of the `Future` to the discarded state.

Since the Docker library is not a libprocess process, we must
implement this with a `shared_ptr` and a mutex, to protect against
concurrent access to the `onDiscard` callback, which must be updated
when retries are performed.

Review: https://reviews.apache.org/r/65683/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/155b3afe
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/155b3afe
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/155b3afe

Branch: refs/heads/1.4.x
Commit: 155b3afec7144baca498036a39ec954b38dbbfbb
Parents: 57cd3c3
Author: Greg Mann <gr...@mesosphere.io>
Authored: Wed Feb 28 15:24:17 2018 -0800
Committer: Gilbert Song <so...@gmail.com>
Committed: Mon Mar 5 18:11:12 2018 -0800

----------------------------------------------------------------------
 src/docker/docker.cpp | 51 ++++++++++++++++++++++++++++++++++------------
 src/docker/docker.hpp | 14 ++++++++++---
 2 files changed, 49 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/155b3afe/src/docker/docker.cpp
----------------------------------------------------------------------
diff --git a/src/docker/docker.cpp b/src/docker/docker.cpp
index 9ab716f..e9e600e 100755
--- a/src/docker/docker.cpp
+++ b/src/docker/docker.cpp
@@ -15,6 +15,8 @@
 // limitations under the License.
 
 #include <map>
+#include <mutex>
+#include <utility>
 #include <vector>
 
 #include <stout/error.hpp>
@@ -56,6 +58,9 @@ using namespace process;
 
 using std::list;
 using std::map;
+using std::mutex;
+using std::pair;
+using std::shared_ptr;
 using std::string;
 using std::vector;
 
@@ -1226,20 +1231,29 @@ Future<Docker::Container> Docker::inspect(
 {
   Owned<Promise<Docker::Container>> promise(new Promise<Docker::Container>());
 
+  // Holds a callback used for cleanup in case this call to 'docker inspect' is
+  // discarded, and a mutex to control access to the callback.
+  auto callback = std::make_shared<pair<lambda::function<void()>, mutex>>();
+
   const string cmd = path + " -H " + socket + " inspect " + containerName;
-  _inspect(cmd, promise, retryInterval);
+  _inspect(cmd, promise, retryInterval, callback);
 
-  return promise->future();
+  return promise->future()
+    .onDiscard([callback]() {
+      synchronized (callback->second) {
+        callback->first();
+      }
+    });
 }
 
 
 void Docker::_inspect(
     const string& cmd,
     const Owned<Promise<Docker::Container>>& promise,
-    const Option<Duration>& retryInterval)
+    const Option<Duration>& retryInterval,
+    shared_ptr<pair<lambda::function<void()>, mutex>> callback)
 {
   if (promise->future().hasDiscard()) {
-    promise->discard();
     return;
   }
 
@@ -1256,13 +1270,25 @@ void Docker::_inspect(
     return;
   }
 
+  // Set the `onDiscard` callback which will clean up the subprocess if the
+  // caller discards the `Future` that we returned.
+  synchronized (callback->second) {
+    callback->first = [promise, s, cmd]() {
+      promise->discard();
+      CHECK_SOME(s);
+      commandDiscarded(s.get(), cmd);
+    };
+  }
+
   // Start reading from stdout so writing to the pipe won't block
   // to handle cases where the output is larger than the pipe
   // capacity.
   const Future<string> output = io::read(s.get().out().get());
 
   s.get().status()
-    .onAny([=]() { __inspect(cmd, promise, retryInterval, output, s.get()); });
+    .onAny([=]() {
+      __inspect(cmd, promise, retryInterval, output, s.get(), callback);
+    });
 }
 
 
@@ -1271,11 +1297,10 @@ void Docker::__inspect(
     const Owned<Promise<Docker::Container>>& promise,
     const Option<Duration>& retryInterval,
     Future<string> output,
-    const Subprocess& s)
+    const Subprocess& s,
+    shared_ptr<pair<lambda::function<void()>, mutex>> callback)
 {
   if (promise->future().hasDiscard()) {
-    promise->discard();
-    output.discard();
     return;
   }
 
@@ -1293,7 +1318,7 @@ void Docker::__inspect(
       VLOG(1) << "Retrying inspect with non-zero status code. cmd: '"
               << cmd << "', interval: " << stringify(retryInterval.get());
       Clock::timer(retryInterval.get(),
-                   [=]() { _inspect(cmd, promise, retryInterval); } );
+                   [=]() { _inspect(cmd, promise, retryInterval, callback); });
       return;
     }
 
@@ -1315,7 +1340,7 @@ void Docker::__inspect(
   CHECK_SOME(s.out());
   output
     .onAny([=](const Future<string>& output) {
-      ___inspect(cmd, promise, retryInterval, output);
+      ___inspect(cmd, promise, retryInterval, output, callback);
     });
 }
 
@@ -1324,10 +1349,10 @@ void Docker::___inspect(
     const string& cmd,
     const Owned<Promise<Docker::Container>>& promise,
     const Option<Duration>& retryInterval,
-    const Future<string>& output)
+    const Future<string>& output,
+    shared_ptr<pair<lambda::function<void()>, mutex>> callback)
 {
   if (promise->future().hasDiscard()) {
-    promise->discard();
     return;
   }
 
@@ -1348,7 +1373,7 @@ void Docker::___inspect(
     VLOG(1) << "Retrying inspect since container not yet started. cmd: '"
             << cmd << "', interval: " << stringify(retryInterval.get());
     Clock::timer(retryInterval.get(),
-                 [=]() { _inspect(cmd, promise, retryInterval); } );
+                 [=]() { _inspect(cmd, promise, retryInterval, callback); } );
     return;
   }
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/155b3afe/src/docker/docker.hpp
----------------------------------------------------------------------
diff --git a/src/docker/docker.hpp b/src/docker/docker.hpp
index 95e60a7..81c1154 100644
--- a/src/docker/docker.hpp
+++ b/src/docker/docker.hpp
@@ -19,7 +19,9 @@
 
 #include <list>
 #include <map>
+#include <mutex>
 #include <string>
+#include <utility>
 
 #include <process/future.hpp>
 #include <process/owned.hpp>
@@ -334,20 +336,26 @@ private:
   static void _inspect(
       const std::string& cmd,
       const process::Owned<process::Promise<Container>>& promise,
-      const Option<Duration>& retryInterval);
+      const Option<Duration>& retryInterval,
+      std::shared_ptr<std::pair<lambda::function<void()>, std::mutex>>
+        callback);
 
   static void __inspect(
       const std::string& cmd,
       const process::Owned<process::Promise<Container>>& promise,
       const Option<Duration>& retryInterval,
       process::Future<std::string> output,
-      const process::Subprocess& s);
+      const process::Subprocess& s,
+      std::shared_ptr<std::pair<lambda::function<void()>, std::mutex>>
+        callback);
 
   static void ___inspect(
       const std::string& cmd,
       const process::Owned<process::Promise<Container>>& promise,
       const Option<Duration>& retryInterval,
-      const process::Future<std::string>& output);
+      const process::Future<std::string>& output,
+      std::shared_ptr<std::pair<lambda::function<void()>, std::mutex>>
+        callback);
 
   static process::Future<std::list<Container>> _ps(
       const Docker& docker,


[04/12] mesos git commit: Windows: Fixed flaky Docker command health check test.

Posted by gi...@apache.org.
Windows: Fixed flaky Docker command health check test.

The `DockerContainerizerHealthCheckTest.ROOT_DOCKER_
DockerHealthStatusChange` test was flaky on Windows, because
the Docker executor manually reaps the container exit code in
case that `docker run` fails to get the exit code. This logic
doesn't work on Windows, since the process might not be visible to
the container host machine, causing `TASK_FAILED` to get sent. By
removing the reaping logic on Windows, the test is much more reliable.

Review: https://reviews.apache.org/r/65733/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/ecd0dd85
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/ecd0dd85
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/ecd0dd85

Branch: refs/heads/1.4.x
Commit: ecd0dd85e383d789104896b55c38fb9837866ab3
Parents: 4faa7f4
Author: Akash Gupta <ak...@hotmail.com>
Authored: Sun Feb 25 13:37:42 2018 -0800
Committer: Gilbert Song <so...@gmail.com>
Committed: Fri Mar 2 16:55:50 2018 -0800

----------------------------------------------------------------------
 src/docker/executor.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/ecd0dd85/src/docker/executor.cpp
----------------------------------------------------------------------
diff --git a/src/docker/executor.cpp b/src/docker/executor.cpp
index 2c538ce..e580a48 100644
--- a/src/docker/executor.cpp
+++ b/src/docker/executor.cpp
@@ -253,6 +253,13 @@ public:
         // executor never returning although the container has already exited.
         // To workaround this issue, here we reap the container process directly
         // so we will be notified when the container exits.
+        //
+        // The issue has only been reported on Linux, so it's not clear if
+        // Windows also has this issue. Regardless, we don't use this workaround
+        // for Windows, because the pid legitimately might not exist. For
+        // example, if the container is running in Hyper-V isolation, the pid
+        // will only exist in the guest OS.
+#ifndef __WINDOWS__
         if (container.pid.isSome()) {
           process::reap(container.pid.get())
             .then(defer(self(), [=](const Option<int>& status) {
@@ -286,6 +293,7 @@ public:
               &Self::reapedContainer,
               None());
         }
+#endif // __WINDOWS__
 
         return Nothing();
       }));


[06/12] mesos git commit: Avoided orphan subprocess in the Docker library.

Posted by gi...@apache.org.
Avoided orphan subprocess in the Docker library.

This patch ensures that `Docker::inspect` will not leave orphan
subprocesses behind.

Review: https://reviews.apache.org/r/65887/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/d841ba45
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/d841ba45
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/d841ba45

Branch: refs/heads/1.4.x
Commit: d841ba457926baaec6c188ec0f08a95d295af5ab
Parents: 155b3af
Author: Greg Mann <gr...@mesosphere.io>
Authored: Fri Mar 2 15:39:58 2018 -0800
Committer: Gilbert Song <so...@gmail.com>
Committed: Mon Mar 5 18:11:12 2018 -0800

----------------------------------------------------------------------
 src/docker/docker.cpp | 7 +++++++
 1 file changed, 7 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/d841ba45/src/docker/docker.cpp
----------------------------------------------------------------------
diff --git a/src/docker/docker.cpp b/src/docker/docker.cpp
index e9e600e..2a9316d 100755
--- a/src/docker/docker.cpp
+++ b/src/docker/docker.cpp
@@ -1273,6 +1273,13 @@ void Docker::_inspect(
   // Set the `onDiscard` callback which will clean up the subprocess if the
   // caller discards the `Future` that we returned.
   synchronized (callback->second) {
+    // It's possible that the caller has discarded their future while we were
+    // creating a new subprocess, so we clean up here if necessary.
+    if (promise->future().hasDiscard()) {
+      commandDiscarded(s.get(), cmd);
+      return;
+    }
+
     callback->first = [promise, s, cmd]() {
       promise->discard();
       CHECK_SOME(s);


[03/12] mesos git commit: Prevented Docker library from terminating incorrect processes.

Posted by gi...@apache.org.
Prevented Docker library from terminating incorrect processes.

Previously, the Docker library might call `os::killtree()` on a
PID after the associated subprocess had already terminated, which
could lead to an unknown process being incorrectly killed.

Review: https://reviews.apache.org/r/65786/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/e26355f9
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/e26355f9
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/e26355f9

Branch: refs/heads/1.4.x
Commit: e26355f91383e37e6e7fe6e58816692db7116303
Parents: ecd0dd8
Author: Greg Mann <gr...@mesosphere.io>
Authored: Fri Feb 23 16:41:44 2018 -0800
Committer: Gilbert Song <so...@gmail.com>
Committed: Fri Mar 2 16:55:50 2018 -0800

----------------------------------------------------------------------
 src/docker/docker.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/e26355f9/src/docker/docker.cpp
----------------------------------------------------------------------
diff --git a/src/docker/docker.cpp b/src/docker/docker.cpp
index 192e170..f0dbd8d 100755
--- a/src/docker/docker.cpp
+++ b/src/docker/docker.cpp
@@ -148,8 +148,10 @@ Try<Owned<Docker>> Docker::create(
 
 void commandDiscarded(const Subprocess& s, const string& cmd)
 {
-  VLOG(1) << "'" << cmd << "' is being discarded";
-  os::killtree(s.pid(), SIGKILL);
+  if (s.status().isPending()) {
+    VLOG(1) << "'" << cmd << "' is being discarded";
+    os::killtree(s.pid(), SIGKILL);
+  }
 }
 
 


[12/12] mesos git commit: Added inspect retries to the docker containerizer in `update` method.

Posted by gi...@apache.org.
Added inspect retries to the docker containerizer in `update` method.

This patch fixes the bug when a terminal status update is never sent
after termination of the docker executor, when the Docker daemon hangs
for `inspect` command. A terminal status update is postponed until
containerizer `update` completes that uses `inspect` command to get
a PID of the docker container. To address the issue, we retry `inspect`
in the loop.

Review: https://reviews.apache.org/r/65868/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/84c97007
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/84c97007
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/84c97007

Branch: refs/heads/1.4.x
Commit: 84c97007ca01b4e3d3851a899a38ae2e13493e5f
Parents: 513c8dd
Author: Andrei Budnik <ab...@mesosphere.com>
Authored: Fri Mar 2 15:39:09 2018 -0800
Committer: Gilbert Song <so...@gmail.com>
Committed: Mon Mar 5 18:11:13 2018 -0800

----------------------------------------------------------------------
 src/slave/constants.hpp            |  3 +++
 src/slave/containerizer/docker.cpp | 42 ++++++++++++++++++++++++++++++++-
 2 files changed, 44 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/84c97007/src/slave/constants.hpp
----------------------------------------------------------------------
diff --git a/src/slave/constants.hpp b/src/slave/constants.hpp
index fac831c..71837e7 100644
--- a/src/slave/constants.hpp
+++ b/src/slave/constants.hpp
@@ -133,6 +133,9 @@ constexpr Duration DOCKER_REMOVE_DELAY = Hours(6);
 // container.
 constexpr Duration DOCKER_INSPECT_DELAY = Seconds(1);
 
+// Default duration to wait for `inspect` command completion.
+constexpr Duration DOCKER_INSPECT_TIMEOUT = Seconds(5);
+
 // Default maximum number of docker inspect calls docker ps will invoke
 // in parallel to prevent hitting system's open file descriptor limit.
 constexpr size_t DOCKER_PS_MAX_INSPECT_CALLS = 100;

http://git-wip-us.apache.org/repos/asf/mesos/blob/84c97007/src/slave/containerizer/docker.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/docker.cpp b/src/slave/containerizer/docker.cpp
index e261a42..79d1caa 100644
--- a/src/slave/containerizer/docker.cpp
+++ b/src/slave/containerizer/docker.cpp
@@ -1681,7 +1681,47 @@ Future<Nothing> DockerContainerizerProcess::update(
     return __update(containerId, _resources, container->pid.get());
   }
 
-  return docker->inspect(containers_.at(containerId)->containerName)
+  string containerName = containers_.at(containerId)->containerName;
+
+  // Since the Docker daemon might hang, we have to retry the inspect command.
+  //
+  // NOTE: This code is duplicated from the built-in docker executor, but
+  // the retry interval is not passed to `inspect`, because the container might
+  // be terminated.
+  // TODO(abudnik): Consider using a class helper for retrying docker commands.
+  auto inspectLoop = loop(
+      self(),
+      [=]() {
+        return await(
+            docker->inspect(containerName)
+              .after(
+                  slave::DOCKER_INSPECT_TIMEOUT,
+                  [=](Future<Docker::Container> future) {
+                    LOG(WARNING) << "Docker inspect timed out after "
+                                 << slave::DOCKER_INSPECT_TIMEOUT
+                                 << " for container "
+                                 << "'" << containerName << "'";
+
+                    // We need to clean up the hanging Docker CLI process.
+                    // Discarding the inspect future triggers a callback in
+                    // the Docker library that kills the subprocess and
+                    // transitions the future.
+                    future.discard();
+                    return future;
+                  }));
+      },
+      [](const Future<Docker::Container>& future)
+          -> Future<ControlFlow<Docker::Container>> {
+        if (future.isReady()) {
+          return Break(future.get());
+        }
+        if (future.isFailed()) {
+          return Failure(future.failure());
+        }
+        return Continue();
+      });
+
+  return inspectLoop
     .then(defer(self(), &Self::_update, containerId, _resources, lambda::_1));
 #else
   return Nothing();