You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by qi...@apache.org on 2018/02/02 08:59:33 UTC
[1/3] mesos git commit: Fixed a coding error in a log message of
Docker containerizer.
Repository: mesos
Updated Branches:
refs/heads/1.5.x f7e3872b0 -> 8a9f99e9f
Fixed a coding error in a log message of Docker containerizer.
Review: https://reviews.apache.org/r/65420
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/cb01bb21
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/cb01bb21
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/cb01bb21
Branch: refs/heads/1.5.x
Commit: cb01bb21a51377acbae021c455645422eb07199c
Parents: c12eee0
Author: Qian Zhang <zh...@gmail.com>
Authored: Mon Jan 29 16:20:23 2018 +0800
Committer: Qian Zhang <zh...@gmail.com>
Committed: Fri Feb 2 09:48:03 2018 +0800
----------------------------------------------------------------------
src/slave/containerizer/docker.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/cb01bb21/src/slave/containerizer/docker.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/docker.cpp b/src/slave/containerizer/docker.cpp
index 5f026f0..7585178 100644
--- a/src/slave/containerizer/docker.cpp
+++ b/src/slave/containerizer/docker.cpp
@@ -2489,7 +2489,7 @@ Future<Nothing> DockerContainerizerProcess::destroyTimeout(
// Ignoring the error from killing process as it can already
// have exited.
VLOG(1) << "Ignoring error when killing process pid "
- << container->executorPid.get() << " in destroy, error: "
+ << container->pid.get() << " in destroy, error: "
<< kill.error();
}
}
[2/3] mesos git commit: Reaped Docker executor only when it can be
connected.
Posted by qi...@apache.org.
Reaped Docker executor only when it can be connected.
Review: https://reviews.apache.org/r/65382
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/c12eee06
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/c12eee06
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/c12eee06
Branch: refs/heads/1.5.x
Commit: c12eee066f70c6a2c666e1fa81da2fdea3fcc3fd
Parents: f7e3872
Author: Qian Zhang <zh...@gmail.com>
Authored: Sun Jan 28 20:57:23 2018 +0800
Committer: Qian Zhang <zh...@gmail.com>
Committed: Fri Feb 2 09:48:03 2018 +0800
----------------------------------------------------------------------
src/slave/containerizer/docker.cpp | 50 ++++++++++++++++++++++++++++-----
1 file changed, 43 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/c12eee06/src/slave/containerizer/docker.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/docker.cpp b/src/slave/containerizer/docker.cpp
index f1d7d3e..5f026f0 100644
--- a/src/slave/containerizer/docker.cpp
+++ b/src/slave/containerizer/docker.cpp
@@ -28,6 +28,7 @@
#include <process/defer.hpp>
#include <process/delay.hpp>
#include <process/io.hpp>
+#include <process/network.hpp>
#include <process/owned.hpp>
#include <process/reap.hpp>
#include <process/subprocess.hpp>
@@ -969,12 +970,14 @@ Future<Nothing> DockerContainerizerProcess::_recover(
CHECK_SOME(run.get().id);
CHECK_EQ(containerId, run.get().id.get());
- // We need the pid so the reaper can monitor the executor so
- // skip this executor if it's not present. This is not an
- // error because the slave will try to wait on the container
- // which will return a failed 'ContainerTermination' and
- // everything will get cleaned up.
- if (!run.get().forkedPid.isSome()) {
+ // We need the pid so the reaper can monitor the executor, so skip this
+ // executor if it's not present. We also skip this executor if the
+ // libprocess pid is not present, which means the slave exited before
+ // checkpointing it, in which case the executor will shut itself down
+ // immediately. Both cases are safe to skip because the slave will try
+ // to wait on the container, which will return `None()`, and everything
+ // will get cleaned up.
+ if (run.get().forkedPid.isNone() || run.get().libprocessPid.isNone()) {
continue;
}
@@ -1020,9 +1023,42 @@ Future<Nothing> DockerContainerizerProcess::_recover(
container->containerName = existingContainers.at(containerId);
}
+ // Only reap the executor process if the executor can be connected to;
+ // otherwise just set `container->status` to `None()`. This is to
+ // avoid reaping an irrelevant process, e.g., after the agent host is
+ // rebooted, the executor pid happens to be reused by another process.
+ // See MESOS-8125 for details.
+ // Note that if both the pid and the port of the executor are reused
+ // (by one process, or by two different processes) after the agent
+ // host reboots, we will still reap an irrelevant process, but that
+ // should be highly unlikely.
pid_t pid = run.get().forkedPid.get();
- container->status.set(process::reap(pid));
+ // Create a TCP socket.
+ int_fd socket = ::socket(AF_INET, SOCK_STREAM, 0);
+ if (socket < 0) {
+ return Failure(
+ "Failed to create socket for connecting to executor '" +
+ stringify(executor.id) + "': " + os::strerror(errno));
+ }
+
+ Try<Nothing, SocketError> connect = process::network::connect(
+ socket,
+ run.get().libprocessPid->address);
+
+ if (connect.isSome()) {
+ container->status.set(process::reap(pid));
+ } else {
+ LOG(WARNING) << "Failed to connect to executor '" << executor.id
+ << "' of framework " << framework.id << ": "
+ << connect.error().message;
+
+ container->status.set(Future<Option<int>>(None()));
+ }
+
+ // Shut down and close the socket.
+ shutdown(socket, SHUT_RDWR);
+ close(socket);
container->status.future().get()
.onAny(defer(self(), &Self::reaped, containerId));
[3/3] mesos git commit: Moved MESOS-8125 to the 1.5.0 CHANGELOG.
Posted by qi...@apache.org.
Moved MESOS-8125 to the 1.5.0 CHANGELOG.
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/8a9f99e9
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/8a9f99e9
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/8a9f99e9
Branch: refs/heads/1.5.x
Commit: 8a9f99e9f1f4e28657cff2901e3eceac5eccf4b8
Parents: cb01bb2
Author: Qian Zhang <zh...@gmail.com>
Authored: Fri Feb 2 16:51:32 2018 +0800
Committer: Qian Zhang <zh...@gmail.com>
Committed: Fri Feb 2 16:55:11 2018 +0800
----------------------------------------------------------------------
CHANGELOG | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/8a9f99e9/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index 449d5b1..0b606de 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -134,7 +134,6 @@ Unresolved Critical Issues:
* [MESOS-7966] - check for maintenance on agent causes fatal error.
* [MESOS-7991] - fatal, check failed !framework->recovered().
* [MESOS-8038] - Launching GPU task sporadically fails.
- * [MESOS-8125] - Agent should properly handle recovering an executor when its pid is reused.
* [MESOS-8137] - Mesos agent can hang during startup.
* [MESOS-8256] - Libprocess can silently deadlock due to worker thread exhaustion.
* [MESOS-8411] - Killing a queued task can lead to the command executor never terminating.
@@ -249,6 +248,7 @@ All Resolved Issues:
* [MESOS-8119] - ROOT_DOCKER_DockerHealthyTask segfaults in debian 8.
* [MESOS-8121] - Unified Containerizer Auto backend should check xfs ftype for overlayfs backend.
* [MESOS-8123] - GPU tests are failing due to TASK_STARTING.
+ * [MESOS-8125] - Agent should properly handle recovering an executor when its pid is reused.
* [MESOS-8135] - Masters can lose track of tasks' executor IDs.
* [MESOS-8136] - Update XFS isolator tests to handle TASK_STARTING.
* [MESOS-8157] - Review #62775 broke the build.