You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by an...@apache.org on 2016/11/23 04:47:10 UTC

[1/6] mesos git commit: Fixed Mesos containerizer to set container FETCHING state.

Repository: mesos
Updated Branches:
  refs/heads/0.28.x 95ee5a583 -> 52a0b0a41


Fixed Mesos containerizer to set container FETCHING state.

If the container state is not properly set to FETCHING, the Mesos agent
cannot detect the terminated executor when the fetcher times out.

Review: https://reviews.apache.org/r/49650


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/56b4c561
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/56b4c561
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/56b4c561

Branch: refs/heads/0.28.x
Commit: 56b4c561e08a8cc36e5cbc3a786981412bf226dd
Parents: 95ee5a5
Author: Jiang Yan Xu <xu...@apple.com>
Authored: Fri Jul 1 15:27:37 2016 -0700
Committer: Anand Mazumdar <an...@apache.org>
Committed: Tue Nov 22 20:30:46 2016 -0800

----------------------------------------------------------------------
 src/slave/containerizer/mesos/containerizer.cpp | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/56b4c561/src/slave/containerizer/mesos/containerizer.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/containerizer.cpp b/src/slave/containerizer/mesos/containerizer.cpp
index 658db78..52906dd 100644
--- a/src/slave/containerizer/mesos/containerizer.cpp
+++ b/src/slave/containerizer/mesos/containerizer.cpp
@@ -945,6 +945,12 @@ Future<Nothing> MesosContainerizerProcess::fetch(
     return Failure("Container is already destroyed");
   }
 
+  if (containers_[containerId]->state == DESTROYING) {
+    return Failure("Container is currently being destroyed");
+  }
+
+  containers_[containerId]->state = FETCHING;
+
   return fetcher->fetch(
       containerId,
       commandInfo,
@@ -1493,10 +1499,6 @@ void MesosContainerizerProcess::destroy(
     return;
   }
 
-  if (container->state == FETCHING) {
-    fetcher->kill(containerId);
-  }
-
   if (container->state == ISOLATING) {
     VLOG(1) << "Waiting for the isolators to complete for container '"
             << containerId << "'";
@@ -1511,6 +1513,11 @@ void MesosContainerizerProcess::destroy(
     return;
   }
 
+  // Either RUNNING or FETCHING at this point.
+  if (container->state == FETCHING) {
+    fetcher->kill(containerId);
+  }
+
   container->state = DESTROYING;
   _destroy(containerId);
 }


[5/6] mesos git commit: Made Mesos containerizer error messages more consistent.

Posted by an...@apache.org.
Made Mesos containerizer error messages more consistent.

We've been using slightly different wordings for the same condition in
multiple places in the Mesos containerizer, but they don't provide
additional information about where the failure is raised in a long
continuation chain. Since failures don't capture the location in the
code, we'd better distinguish them in a more meaningful way to assist
debugging.

Review: https://reviews.apache.org/r/49653


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/2d61bde8
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/2d61bde8
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/2d61bde8

Branch: refs/heads/0.28.x
Commit: 2d61bde81e3d6fb7400ec5f7078ceedd8d2bb802
Parents: d7f8b85
Author: Jiang Yan Xu <xu...@apple.com>
Authored: Fri Jul 1 18:12:01 2016 -0700
Committer: Anand Mazumdar <an...@apache.org>
Committed: Tue Nov 22 20:31:26 2016 -0800

----------------------------------------------------------------------
 src/slave/containerizer/mesos/containerizer.cpp | 29 +++++++++++---------
 1 file changed, 16 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/2d61bde8/src/slave/containerizer/mesos/containerizer.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/containerizer.cpp b/src/slave/containerizer/mesos/containerizer.cpp
index e902853..3262850 100644
--- a/src/slave/containerizer/mesos/containerizer.cpp
+++ b/src/slave/containerizer/mesos/containerizer.cpp
@@ -770,14 +770,14 @@ Future<bool> MesosContainerizerProcess::_launch(
   // and its dependencies finish before '_launch' starts since onAny
   // is not guaranteed to be executed in order.
   if (!containers_.contains(containerId)) {
-    return Failure("Container has been destroyed");
+    return Failure("Container destroyed during provisioning");
   }
 
   // Make sure containerizer is not in DESTROYING state, to avoid
   // a possible race that containerizer is destroying the container
   // while it is provisioning the image from volumes.
   if (containers_[containerId]->state == DESTROYING) {
-    return Failure("Container is currently being destroyed");
+    return Failure("Container is being destroyed during provisioning");
   }
 
   CHECK_EQ(containers_[containerId]->state, PROVISIONING);
@@ -873,14 +873,14 @@ Future<list<Option<ContainerLaunchInfo>>> MesosContainerizerProcess::prepare(
   // 'prepare' starts since onAny is not guaranteed to be executed
   // in order.
   if (!containers_.contains(containerId)) {
-    return Failure("Container has been destroyed");
+    return Failure("Container destroyed during provisioning");
   }
 
   // Make sure containerizer is not in DESTROYING state, to avoid
   // a possible race that containerizer is destroying the container
   // while it is preparing isolators for the container.
   if (containers_[containerId]->state == DESTROYING) {
-    return Failure("Container is currently being destroyed");
+    return Failure("Container is being destroyed during provisioning");
   }
 
   CHECK_EQ(containers_[containerId]->state, PROVISIONING);
@@ -946,11 +946,11 @@ Future<Nothing> MesosContainerizerProcess::fetch(
     const SlaveID& slaveId)
 {
   if (!containers_.contains(containerId)) {
-    return Failure("Container is already destroyed");
+    return Failure("Container destroyed during isolating");
   }
 
   if (containers_[containerId]->state == DESTROYING) {
-    return Failure("Container is currently being destroyed");
+    return Failure("Container is being destroyed during isolating");
   }
 
   CHECK_EQ(containers_[containerId]->state, ISOLATING);
@@ -980,11 +980,11 @@ Future<bool> MesosContainerizerProcess::__launch(
     const list<Option<ContainerLaunchInfo>>& launchInfos)
 {
   if (!containers_.contains(containerId)) {
-    return Failure("Container has been destroyed");
+    return Failure("Container destroyed during preparing");
   }
 
   if (containers_[containerId]->state == DESTROYING) {
-    return Failure("Container is currently being destroyed");
+    return Failure("Container is being destroyed during preparing");
   }
 
   CHECK_EQ(containers_[containerId]->state, PREPARING);
@@ -1230,11 +1230,11 @@ Future<bool> MesosContainerizerProcess::isolate(
     pid_t _pid)
 {
   if (!containers_.contains(containerId)) {
-    return Failure("Container is already destroyed");
+    return Failure("Container destroyed during preparing");
   }
 
   if (containers_[containerId]->state == DESTROYING) {
-    return Failure("Container is currently being destroyed");
+    return Failure("Container is being destroyed during preparing");
   }
 
   CHECK_EQ(containers_[containerId]->state, PREPARING);
@@ -1271,9 +1271,12 @@ Future<bool> MesosContainerizerProcess::exec(
 {
   // The container may be destroyed before we exec the executor so
   // return failure here.
-  if (!containers_.contains(containerId) ||
-      containers_[containerId]->state == DESTROYING) {
-    return Failure("Container destroyed during launch");
+  if (!containers_.contains(containerId)) {
+    return Failure("Container destroyed during fetching");
+  }
+
+  if (containers_[containerId]->state == DESTROYING) {
+    return Failure("Container is being destroyed during fetching");
   }
 
   CHECK_EQ(containers_[containerId]->state, FETCHING);


[2/6] mesos git commit: Fail container launch if it's destroyed during logger->prepare().

Posted by an...@apache.org.
Fail container launch if it's destroyed during logger->prepare().

Review: https://reviews.apache.org/r/49725


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/90b5be8e
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/90b5be8e
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/90b5be8e

Branch: refs/heads/0.28.x
Commit: 90b5be8e95c5868ea9142625b97050a75d0664f5
Parents: 56b4c56
Author: Jiang Yan Xu <xu...@apple.com>
Authored: Wed Jul 6 13:48:34 2016 -0700
Committer: Anand Mazumdar <an...@apache.org>
Committed: Tue Nov 22 20:30:57 2016 -0800

----------------------------------------------------------------------
 src/slave/containerizer/mesos/containerizer.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/90b5be8e/src/slave/containerizer/mesos/containerizer.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/containerizer.cpp b/src/slave/containerizer/mesos/containerizer.cpp
index 52906dd..b1dc7a8 100644
--- a/src/slave/containerizer/mesos/containerizer.cpp
+++ b/src/slave/containerizer/mesos/containerizer.cpp
@@ -1221,7 +1221,13 @@ Future<bool> MesosContainerizerProcess::isolate(
     const ContainerID& containerId,
     pid_t _pid)
 {
-  CHECK(containers_.contains(containerId));
+  if (!containers_.contains(containerId)) {
+    return Failure("Container is already destroyed");
+  }
+
+  if (containers_[containerId]->state == DESTROYING) {
+    return Failure("Container is currently being destroyed");
+  }
 
   containers_[containerId]->state = ISOLATING;
 


[4/6] mesos git commit: Improved Mesos containerizer invariant checking.

Posted by an...@apache.org.
Improved Mesos containerizer invariant checking.

One of the reasons for MESOS-5763 is the lack of invariant
checking. Mesos containerizer transitions the container state in
particular ways, so when continuation chains could potentially be
interleaved with other actions we should verify the state transitions.

Review: https://reviews.apache.org/r/49652


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/d7f8b855
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/d7f8b855
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/d7f8b855

Branch: refs/heads/0.28.x
Commit: d7f8b8558974ee8739d460d53faf54a52832b754
Parents: 008e044
Author: Jiang Yan Xu <xu...@apple.com>
Authored: Fri Jul 1 18:11:29 2016 -0700
Committer: Anand Mazumdar <an...@apache.org>
Committed: Tue Nov 22 20:31:17 2016 -0800

----------------------------------------------------------------------
 src/slave/containerizer/mesos/containerizer.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/d7f8b855/src/slave/containerizer/mesos/containerizer.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/containerizer.cpp b/src/slave/containerizer/mesos/containerizer.cpp
index 59c2cd8..e902853 100644
--- a/src/slave/containerizer/mesos/containerizer.cpp
+++ b/src/slave/containerizer/mesos/containerizer.cpp
@@ -780,6 +780,8 @@ Future<bool> MesosContainerizerProcess::_launch(
     return Failure("Container is currently being destroyed");
   }
 
+  CHECK_EQ(containers_[containerId]->state, PROVISIONING);
+
   // We will provision the images specified in ContainerInfo::volumes
   // as well. We will mutate ContainerInfo::volumes to include the
   // paths to the provisioned root filesystems (by setting the
@@ -881,6 +883,8 @@ Future<list<Option<ContainerLaunchInfo>>> MesosContainerizerProcess::prepare(
     return Failure("Container is currently being destroyed");
   }
 
+  CHECK_EQ(containers_[containerId]->state, PROVISIONING);
+
   containers_[containerId]->state = PREPARING;
 
   // Construct ContainerConfig.
@@ -949,6 +953,8 @@ Future<Nothing> MesosContainerizerProcess::fetch(
     return Failure("Container is currently being destroyed");
   }
 
+  CHECK_EQ(containers_[containerId]->state, ISOLATING);
+
   containers_[containerId]->state = FETCHING;
 
   return fetcher->fetch(
@@ -981,6 +987,8 @@ Future<bool> MesosContainerizerProcess::__launch(
     return Failure("Container is currently being destroyed");
   }
 
+  CHECK_EQ(containers_[containerId]->state, PREPARING);
+
   // Prepare environment variables for the executor.
   map<string, string> environment = executorEnvironment(
       executorInfo,
@@ -1229,6 +1237,8 @@ Future<bool> MesosContainerizerProcess::isolate(
     return Failure("Container is currently being destroyed");
   }
 
+  CHECK_EQ(containers_[containerId]->state, PREPARING);
+
   containers_[containerId]->state = ISOLATING;
 
   // Set up callbacks for isolator limitations.
@@ -1266,6 +1276,8 @@ Future<bool> MesosContainerizerProcess::exec(
     return Failure("Container destroyed during launch");
   }
 
+  CHECK_EQ(containers_[containerId]->state, FETCHING);
+
   // Now that we've contained the child we can signal it to continue
   // by writing to the pipe.
   char dummy;
@@ -1547,6 +1559,8 @@ void MesosContainerizerProcess::destroy(
 void MesosContainerizerProcess::_destroy(
     const ContainerID& containerId)
 {
+  CHECK(containers_.contains(containerId));
+
   // Kill all processes then continue destruction.
   launcher->destroy(containerId)
     .onAny(defer(self(), &Self::__destroy, containerId, lambda::_1));
@@ -1595,6 +1609,8 @@ void MesosContainerizerProcess::___destroy(
     const Future<Option<int>>& status,
     const Option<string>& message)
 {
+  CHECK(containers_.contains(containerId));
+
   cleanupIsolators(containerId)
     .onAny(defer(self(),
                  &Self::____destroy,


[6/6] mesos git commit: Added MESOS-5763 to 0.28.3 CHANGELOG.

Posted by an...@apache.org.
Added MESOS-5763 to 0.28.3 CHANGELOG.


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/52a0b0a4
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/52a0b0a4
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/52a0b0a4

Branch: refs/heads/0.28.x
Commit: 52a0b0a41482da35dc736ec2fd445b6099e7a4e7
Parents: 2d61bde
Author: Anand Mazumdar <an...@apache.org>
Authored: Tue Nov 22 20:38:43 2016 -0800
Committer: Anand Mazumdar <an...@apache.org>
Committed: Tue Nov 22 20:46:02 2016 -0800

----------------------------------------------------------------------
 CHANGELOG | 1 +
 1 file changed, 1 insertion(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/52a0b0a4/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index e9af58b..cfc5f80 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -18,6 +18,7 @@ All Issues:
   * [MESOS-5723] - SSL-enabled libprocess will leak incoming links to forks.
   * [MESOS-5740] - Consider adding `relink` functionality to libprocess.
   * [MESOS-5748] - Potential segfault in `link` when linking to a remote process.
+  * [MESOS-5763] - Task stuck in fetching is not cleaned up after --executor_registration_timeout.
   * [MESOS-5913] - Stale socket FD usage when using libevent + SSL.
   * [MESOS-5927] - Unable to run "scratch" Dockerfiles with Unified Containerizer.
   * [MESOS-5943] - Incremental http parsing of URLs leads to decoder error.


[3/6] mesos git commit: Improved Mesos containerizer logging and documentation.

Posted by an...@apache.org.
Improved Mesos containerizer logging and documentation.

Review: https://reviews.apache.org/r/49651


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/008e0443
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/008e0443
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/008e0443

Branch: refs/heads/0.28.x
Commit: 008e04433026aaec49779197c4a7b6655d5bb693
Parents: 90b5be8
Author: Jiang Yan Xu <xu...@apple.com>
Authored: Fri Jul 1 15:25:54 2016 -0700
Committer: Anand Mazumdar <an...@apache.org>
Committed: Tue Nov 22 20:31:05 2016 -0800

----------------------------------------------------------------------
 src/slave/containerizer/mesos/containerizer.cpp | 21 +++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/008e0443/src/slave/containerizer/mesos/containerizer.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/containerizer.cpp b/src/slave/containerizer/mesos/containerizer.cpp
index b1dc7a8..59c2cd8 100644
--- a/src/slave/containerizer/mesos/containerizer.cpp
+++ b/src/slave/containerizer/mesos/containerizer.cpp
@@ -1288,7 +1288,10 @@ Future<containerizer::Termination> MesosContainerizerProcess::wait(
     const ContainerID& containerId)
 {
   if (!containers_.contains(containerId)) {
-    return Failure("Unknown container: " + stringify(containerId));
+    // See the comments in destroy() for race conditions which lead
+    // to "unknown containers".
+    return Failure("Unknown container (could have already been destroyed): " +
+                   stringify(containerId));
   }
 
   return containers_[containerId]->promise.future();
@@ -1451,14 +1454,26 @@ void MesosContainerizerProcess::destroy(
     const ContainerID& containerId)
 {
   if (!containers_.contains(containerId)) {
-    LOG(WARNING) << "Ignoring destroy of unknown container: " << containerId;
+    // This can happen due to the race between destroys initiated by
+    // the launch failure, the terminated executor and the agent so
+    // the same container is destroyed multiple times in reaction to
+    // one failure. e.g., a stuck fetcher results in:
+    // - The agent invoking destroy(), which kills the fetcher and
+    //   the executor.
+    // - The agent invoking destroy() again for the failed launch
+    //   (due to the fetcher getting killed).
+    // - The containerizer invoking destroy() for the reaped executor.
+    //
+    // The guard here and `if (container->state == DESTROYING)` below
+    // make sure redundant destroys short-circuit.
+    VLOG(1) << "Ignoring destroy of unknown container: " << containerId;
     return;
   }
 
   Container* container = containers_[containerId].get();
 
   if (container->state == DESTROYING) {
-    // Destroy has already been initiated.
+    VLOG(1) << "Destroy has already been initiated for '" << containerId << "'";
     return;
   }