You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by ji...@apache.org on 2016/12/07 00:07:43 UTC
[1/6] mesos git commit: Added function to unblock IOSwitchboard when
waiting for connection.
Repository: mesos
Updated Branches:
refs/heads/master 0228fa74c -> c18fd732d
Added function to unblock IOSwitchboard when waiting for connection.
Review: https://reviews.apache.org/r/54441/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/85f0c462
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/85f0c462
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/85f0c462
Branch: refs/heads/master
Commit: 85f0c462ad807f2842d9debeb8ee8ce0cc0c7fe4
Parents: 0228fa7
Author: Kevin Klues <kl...@gmail.com>
Authored: Tue Dec 6 15:21:42 2016 -0800
Committer: Jie Yu <yu...@gmail.com>
Committed: Tue Dec 6 15:21:42 2016 -0800
----------------------------------------------------------------------
src/slave/containerizer/mesos/io/switchboard.cpp | 15 +++++++++++++++
src/slave/containerizer/mesos/io/switchboard.hpp | 5 +++++
2 files changed, 20 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/85f0c462/src/slave/containerizer/mesos/io/switchboard.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/io/switchboard.cpp b/src/slave/containerizer/mesos/io/switchboard.cpp
index 76400da..e0627cb 100644
--- a/src/slave/containerizer/mesos/io/switchboard.cpp
+++ b/src/slave/containerizer/mesos/io/switchboard.cpp
@@ -605,6 +605,8 @@ public:
Future<Nothing> run();
+ Future<Nothing> unblock();
+
private:
class HttpConnection
{
@@ -764,6 +766,12 @@ Future<Nothing> IOSwitchboardServer::run()
}
+Future<Nothing> IOSwitchboardServer::unblock()
+{
+ return dispatch(process.get(), &IOSwitchboardServerProcess::unblock);
+}
+
+
IOSwitchboardServerProcess::IOSwitchboardServerProcess(
bool _tty,
int _stdinToFd,
@@ -879,6 +887,13 @@ Future<Nothing> IOSwitchboardServerProcess::run()
}
+Future<Nothing> IOSwitchboardServerProcess::unblock()
+{
+ startRedirect.set(Nothing());
+ return Nothing();
+}
+
+
void IOSwitchboardServerProcess::finalize()
{
foreach (HttpConnection& connection, outputConnections) {
http://git-wip-us.apache.org/repos/asf/mesos/blob/85f0c462/src/slave/containerizer/mesos/io/switchboard.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/io/switchboard.hpp b/src/slave/containerizer/mesos/io/switchboard.hpp
index 839665a..ccf2a1b 100644
--- a/src/slave/containerizer/mesos/io/switchboard.hpp
+++ b/src/slave/containerizer/mesos/io/switchboard.hpp
@@ -134,8 +134,13 @@ public:
~IOSwitchboardServer();
+ // Run the io switchboard server.
process::Future<Nothing> run();
+ // Forcibly unblock the io switchboard server if it
+ // has been started with `waitForConnection` set to `true`.
+ process::Future<Nothing> unblock();
+
private:
IOSwitchboardServer(
bool tty,
[6/6] mesos git commit: Added helpers to checkpoint a
'destroy-on-recovery' file for containers.
Posted by ji...@apache.org.
Added helpers to checkpoint a 'destroy-on-recovery' file for containers.
The existence of this file causes the containerizer to mark the
container for destruction after agent restarts. This is currently
useful for DEBUG containers launched by a
`LAUNCH_NESTED_CONTAINER_SESSION` call.
Review: https://reviews.apache.org/r/54368/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/c18fd732
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/c18fd732
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/c18fd732
Branch: refs/heads/master
Commit: c18fd732de72a45aca8af3c01c18794afefec15a
Parents: 575e196
Author: Kevin Klues <kl...@gmail.com>
Authored: Tue Dec 6 15:21:56 2016 -0800
Committer: Jie Yu <yu...@gmail.com>
Committed: Tue Dec 6 15:21:56 2016 -0800
----------------------------------------------------------------------
src/slave/containerizer/mesos/paths.cpp | 25 +++++++++++++++++++++++++
src/slave/containerizer/mesos/paths.hpp | 14 ++++++++++++++
2 files changed, 39 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/c18fd732/src/slave/containerizer/mesos/paths.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/paths.cpp b/src/slave/containerizer/mesos/paths.cpp
index c554287..c1770ce 100644
--- a/src/slave/containerizer/mesos/paths.cpp
+++ b/src/slave/containerizer/mesos/paths.cpp
@@ -230,6 +230,31 @@ Result<unix::Address> getContainerIOSwitchboardAddress(
#endif // __WINDOWS__
+std::string getContainerForceDestroyOnRecoveryPath(
+ const std::string& runtimeDir,
+ const ContainerID& containerId)
+{
+ return path::join(
+ getRuntimePath(runtimeDir, containerId),
+ FORCE_DESTROY_ON_RECOVERY_FILE);
+}
+
+
+bool getContainerForceDestroyOnRecovery(
+ const std::string& runtimeDir,
+ const ContainerID& containerId)
+{
+ const string path = getContainerForceDestroyOnRecoveryPath(
+ runtimeDir, containerId);
+
+ if (os::exists(path)) {
+ return true;
+ }
+
+ return false;
+}
+
+
Result<ContainerTermination> getContainerTermination(
const string& runtimeDir,
const ContainerID& containerId)
http://git-wip-us.apache.org/repos/asf/mesos/blob/c18fd732/src/slave/containerizer/mesos/paths.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/paths.hpp b/src/slave/containerizer/mesos/paths.hpp
index dd197f9..d85fd34 100644
--- a/src/slave/containerizer/mesos/paths.hpp
+++ b/src/slave/containerizer/mesos/paths.hpp
@@ -41,6 +41,7 @@ constexpr char PID_FILE[] = "pid";
constexpr char STATUS_FILE[] = "status";
constexpr char TERMINATION_FILE[] = "termination";
constexpr char SOCKET_FILE[] = "socket";
+constexpr char FORCE_DESTROY_ON_RECOVERY_FILE[] = "force_destroy_on_recovery";
constexpr char IO_SWITCHBOARD_DIRECTORY[] = "io_switchboard";
constexpr char CONTAINER_DIRECTORY[] = "containers";
@@ -129,6 +130,19 @@ Result<process::network::unix::Address> getContainerIOSwitchboardAddress(
#endif
+// The helper method to get the destroy on recovery file path.
+std::string getContainerForceDestroyOnRecoveryPath(
+ const std::string& runtimeDir,
+ const ContainerID& containerId);
+
+
+// The helper method to check if we should
+// destroy a container on recovery or not.
+bool getContainerForceDestroyOnRecovery(
+ const std::string& runtimeDir,
+ const ContainerID& containerId);
+
+
// The helper method to read the container termination state.
Result<mesos::slave::ContainerTermination> getContainerTermination(
const std::string& runtimeDir,
[3/6] mesos git commit: Added implementation of `recover()` to the
IOSwitchboard isolator.
Posted by ji...@apache.org.
Added implementation of `recover()` to the IOSwitchboard isolator.
Review: https://reviews.apache.org/r/54355/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/4069e1c4
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/4069e1c4
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/4069e1c4
Branch: refs/heads/master
Commit: 4069e1c4248302a0b72201c674b60f1bb988d276
Parents: ff553e8
Author: Kevin Klues <kl...@gmail.com>
Authored: Tue Dec 6 15:21:47 2016 -0800
Committer: Jie Yu <yu...@gmail.com>
Committed: Tue Dec 6 15:21:47 2016 -0800
----------------------------------------------------------------------
.../containerizer/mesos/io/switchboard.cpp | 169 ++++++++++++++++---
.../containerizer/mesos/io/switchboard.hpp | 8 +-
2 files changed, 149 insertions(+), 28 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/4069e1c4/src/slave/containerizer/mesos/io/switchboard.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/io/switchboard.cpp b/src/slave/containerizer/mesos/io/switchboard.cpp
index 98f6d95..8d4f1e9 100644
--- a/src/slave/containerizer/mesos/io/switchboard.cpp
+++ b/src/slave/containerizer/mesos/io/switchboard.cpp
@@ -90,6 +90,7 @@ using mesos::slave::ContainerClass;
using mesos::slave::ContainerIO;
using mesos::slave::ContainerLaunchInfo;
using mesos::slave::ContainerLogger;
+using mesos::slave::ContainerState;
using mesos::slave::Isolator;
namespace mesos {
@@ -132,6 +133,108 @@ bool IOSwitchboard::supportsNesting()
}
+Future<Nothing> IOSwitchboard::recover(
+ const list<ContainerState>& states,
+ const hashset<ContainerID>& orphans)
+{
+#ifdef __WINDOWS__
+ return Nothing();
+#else
+ if (local) {
+ return Nothing();
+ }
+
+ // Recover any active container's io switchboard info.
+ //
+ // NOTE: If a new agent is started with io switchboard server mode
+ // disabled, we will still recover the io switchboard info for
+ // containers previously launched by an agent with server mode enabled.
+ foreach (const ContainerState& state, states) {
+ const ContainerID& containerId = state.container_id();
+
+ const string path = containerizer::paths::getContainerIOSwitchboardPath(
+ flags.runtime_dir, containerId);
+
+ // If we don't have a checkpoint directory created for this
+ // container's io switchboard, there is nothing to recover. This
+ // can only happen for legacy containers, containers that were
+ // launched with `--io_switchboard_enable_server=false`.
+ if (!os::exists(path)) {
+ continue;
+ }
+
+ Result<pid_t> pid = containerizer::paths::getContainerIOSwitchboardPid(
+ flags.runtime_dir, containerId);
+
+ // For active containers that have an io switchboard directory,
+ // we should *always* have a valid pid file. If we don't that is a
+ // an error and we should fail appropriately.
+ if (!pid.isSome()) {
+ return Failure("Failed to get I/O switchboard server pid for"
+ " '" + stringify(containerId) + "':"
+ " " + (pid.isError() ?
+ pid.error() :
+ "pid file does not exist"));
+ }
+
+ infos[containerId] = Owned<Info>(new Info(
+ pid.get(),
+ process::reap(pid.get())));
+ }
+
+ // Recover the io switchboards from any orphaned containers.
+ foreach (const ContainerID& orphan, orphans) {
+ const string path = containerizer::paths::getContainerIOSwitchboardPath(
+ flags.runtime_dir, orphan);
+
+ // If we don't have a checkpoint directory created for this
+ // container's io switchboard, there is nothing to recover.
+ if (!os::exists(path)) {
+ continue;
+ }
+
+ Result<pid_t> pid = containerizer::paths::getContainerIOSwitchboardPid(
+ flags.runtime_dir, orphan);
+
+ // If we were able to retrieve the checkpointed pid, we simply
+ // populate our info struct and rely on the containerizer to
+ // destroy the orphaned container and call `cleanup()` on us later.
+ if (pid.isSome()) {
+ infos[orphan] = Owned<Info>(new Info(
+ pid.get(),
+ process::reap(pid.get())));
+ } else {
+ // If we were not able to retrieve the checkpointed pid, we
+ // still need to populate our info struct (but with a pid value
+ // of `None()`). This way when `cleanup()` is called, we still
+ // do whatever cleanup we can (we just don't wait for the pid
+ // to be reaped -- we do it immediately).
+ //
+ // We could enter this case under 4 conditions:
+ //
+ // (1) The io switchboard we are recovering was launched, but
+ // the agent died before checkpointing its pid.
+ // (2) The io switchboard pid file was removed.
+ // (3) There was an error reading the io switchbaord pid file.
+ // (4) The io switchboard pid file was corrupted.
+ //
+ // We log an error in cases (3) and (4).
+ infos[orphan] = Owned<Info>(new Info(
+ None(),
+ Future<Option<int>>(None())));
+
+ if (pid.isError()) {
+ LOG(ERROR) << "Error retrieving the 'IOSwitchboard' pid file"
+ " for orphan '" << orphan << "': " << pid.error();
+ }
+ }
+ }
+
+ return Nothing();
+#endif // __WINDOWS__
+}
+
+
Future<Option<ContainerLaunchInfo>> IOSwitchboard::prepare(
const ContainerID& containerId,
const ContainerConfig& containerConfig)
@@ -230,22 +333,6 @@ Future<Option<ContainerLaunchInfo>> IOSwitchboard::_prepare(
" '" + stringify(containerId) + "'");
}
- // We start by creating a directory to hold checkpointed files
- // related to the io switchboard server we are about to launch. The
- // existence of this directory indicates that we intended to launch
- // a server on behalf of a container. The lack of any expected files
- // in this directroy during recovery/cleanup indicates that
- // something went wrong and we need to take appropriate action.
- string path = containerizer::paths::getContainerIOSwitchboardPath(
- flags.runtime_dir, containerId);
-
- Try<Nothing> mkdir = os::mkdir(path);
- if (mkdir.isError()) {
- return Failure("Error creating 'IOSwitchboard' checkpoint directory"
- " for container '" + stringify(containerId) + "':"
- " " + mkdir.error());
- }
-
// Return the set of fds that should be sent to the
// container and dup'd onto its stdin/stdout/stderr.
ContainerLaunchInfo launchInfo;
@@ -428,6 +515,23 @@ Future<Option<ContainerLaunchInfo>> IOSwitchboard::_prepare(
"tmp",
"mesos-io-switchboard-" + UUID::random().toString());
+ // Just before launching our io switchboard server, we need to
+ // create a directory to hold checkpointed files related to the
+ // server. The existence of this directory indicates that we
+ // intended to launch an io switchboard server on behalf of a
+ // container. The lack of any expected files in this directroy
+ // during recovery/cleanup indicates that something went wrong and
+ // we need to take appropriate action.
+ string path = containerizer::paths::getContainerIOSwitchboardPath(
+ flags.runtime_dir, containerId);
+
+ Try<Nothing> mkdir = os::mkdir(path);
+ if (mkdir.isError()) {
+ return Failure("Error creating 'IOSwitchboard' checkpoint directory"
+ " for container '" + stringify(containerId) + "':"
+ " " + mkdir.error());
+ }
+
// Launch the io switchboard server process.
// We `dup()` the `stdout` and `stderr` passed to us by the
// container logger over the `stdout` and `stderr` of the io
@@ -483,6 +587,18 @@ Future<Option<ContainerLaunchInfo>> IOSwitchboard::_prepare(
" '" + path + "': " + checkpointed.error());
}
+ // We also checkpoint the child's pid.
+ path = containerizer::paths::getContainerIOSwitchboardPidPath(
+ flags.runtime_dir, containerId);
+
+ checkpointed = slave::state::checkpoint(path, stringify(child->pid()));
+
+ if (checkpointed.isError()) {
+ close(openedFds);
+ return Failure("Failed to checkpoint container's io switchboard pid to"
+ " '" + path + "': " + checkpointed.error());
+ }
+
// Build an info struct for this container.
infos[containerId] = Owned<Info>(new Info(
child->pid(),
@@ -537,16 +653,17 @@ Future<Nothing> IOSwitchboard::cleanup(
// windows yet, there is nothing to wait for here.
return Nothing();
#else
- // We don't particularly care if the process gets reaped or not (it
- // will clean itself up automatically upon process exit). We just
- // try to wait for it to exit if we can. For now there is no need
- // to recover info about a container's IOSwitchboard across agent
- // restarts (so it's OK to simply return `Nothing()` if we don't
- // know about a containerId).
- //
- // TODO(klueska): Add the ability to recover the `IOSwitchboard`'s
- // pid and reap it so we can properly return its status here.
- if (local || !infos.contains(containerId)) {
+ if (local) {
+ return Nothing();
+ }
+
+ // We ignore unknown containers here because legacy containers
+ // without an io switchboard directory will not have an info struct
+ // created for them during recovery. Likewise, containers launched
+ // by a previous agent with io switchboard server mode disabled will
+ // not have info structs created for them either. In both cases
+ // there is nothing to cleanup, so we simly return `Nothing()`.
+ if (!infos.contains(containerId)) {
return Nothing();
}
http://git-wip-us.apache.org/repos/asf/mesos/blob/4069e1c4/src/slave/containerizer/mesos/io/switchboard.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/io/switchboard.hpp b/src/slave/containerizer/mesos/io/switchboard.hpp
index ccf2a1b..ca0eee6 100644
--- a/src/slave/containerizer/mesos/io/switchboard.hpp
+++ b/src/slave/containerizer/mesos/io/switchboard.hpp
@@ -60,6 +60,10 @@ public:
virtual bool supportsNesting();
+ virtual process::Future<Nothing> recover(
+ const std::list<mesos::slave::ContainerState>& states,
+ const hashset<ContainerID>& orphans);
+
virtual process::Future<Option<mesos::slave::ContainerLaunchInfo>> prepare(
const ContainerID& containerId,
const mesos::slave::ContainerConfig& containerConfig);
@@ -73,11 +77,11 @@ public:
private:
struct Info
{
- Info(pid_t _pid, const process::Future<Option<int>>& _status)
+ Info(Option<pid_t> _pid, const process::Future<Option<int>>& _status)
: pid(_pid),
status(_status) {}
- pid_t pid;
+ Option<pid_t> pid;
process::Future<Option<int>> status;
};
[2/6] mesos git commit: Added SIGTERM handler to gracefully shutdown
IOSwitchboard server.
Posted by ji...@apache.org.
Added SIGTERM handler to gracefully shutdown IOSwitchboard server.
When receiving a SIGTERM, the io switchboard process will forcibly
unblock the server from waiting on a connection before attempting to
drain its `stdoutFromFd` and `stderrFromFd` file descriptors. Once
these fds are drained (or they become invalid), the server will shut
itself down as per the normal exit route.
Review: https://reviews.apache.org/r/54442/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/ff553e86
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/ff553e86
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/ff553e86
Branch: refs/heads/master
Commit: ff553e86ade41a47121e6aaf8a5e5fdee1b4e9e6
Parents: 85f0c46
Author: Kevin Klues <kl...@gmail.com>
Authored: Tue Dec 6 15:21:45 2016 -0800
Committer: Jie Yu <yu...@gmail.com>
Committed: Tue Dec 6 15:21:45 2016 -0800
----------------------------------------------------------------------
.../containerizer/mesos/io/switchboard.cpp | 16 ++++++
.../containerizer/mesos/io/switchboard_main.cpp | 56 ++++++++++++++++++++
2 files changed, 72 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/ff553e86/src/slave/containerizer/mesos/io/switchboard.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/io/switchboard.cpp b/src/slave/containerizer/mesos/io/switchboard.cpp
index e0627cb..98f6d95 100644
--- a/src/slave/containerizer/mesos/io/switchboard.cpp
+++ b/src/slave/containerizer/mesos/io/switchboard.cpp
@@ -550,10 +550,26 @@ Future<Nothing> IOSwitchboard::cleanup(
return Nothing();
}
+ Option<pid_t> pid = infos[containerId]->pid;
Future<Option<int>> status = infos[containerId]->status;
infos.erase(containerId);
+ // If we have a pid, then we attempt to send it a SIGTERM to have it
+ // shutdown gracefully. This is best effort, as it's likely that the
+ // switchboard has already shutdown in the common case.
+ //
+ // NOTE: There is an unfortunate race condition here. If the io
+ // switchboard terminates and the pid is reused by some other
+ // process, we might be sending SIGTERM to a random process. This
+ // could be a problem under high load.
+ //
+ // TODO(klueska): Send a message over the io switchboard server's
+ // domain socket instead of using a signal.
+ if (pid.isSome()) {
+ os::kill(pid.get(), SIGTERM);
+ }
+
return status
.then(defer(self(), [this, containerId]() {
// Best effort removal of the unix domain socket file created for
http://git-wip-us.apache.org/repos/asf/mesos/blob/ff553e86/src/slave/containerizer/mesos/io/switchboard_main.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/io/switchboard_main.cpp b/src/slave/containerizer/mesos/io/switchboard_main.cpp
index aff6c21..6ce4cdb 100644
--- a/src/slave/containerizer/mesos/io/switchboard_main.cpp
+++ b/src/slave/containerizer/mesos/io/switchboard_main.cpp
@@ -15,17 +15,48 @@
// limitations under the License.
#include <process/future.hpp>
+#include <process/io.hpp>
#include <process/owned.hpp>
+#include <stout/os.hpp>
#include <stout/try.hpp>
#include "slave/containerizer/mesos/io/switchboard.hpp"
+namespace io = process::io;
+
using namespace mesos::internal::slave;
using process::Future;
using process::Owned;
+// We use a pipe to forcibly unblock an io switchboard server and
+// cause it to gracefully shutdown after receiving a SIGTERM signal.
+//
+// TODO(klueska): Ideally we would use `libevent` or `libev`s built in
+// support to defer a signal handler to a thread, but we currently
+// don't expose this through libprocess. Once we do expose this, we
+// should change this logic to use it.
+int unblockFds[2];
+
+
+static void sigtermHandler(int sig)
+{
+ int write = -1;
+ do {
+ write = ::write(unblockFds[1], "\0", 1);
+ } while (write == -1 && errno == EINTR);
+
+ ::close(unblockFds[1]);
+
+ if (write == -1) {
+ const char error[] = "Failed to terminate io switchboard gracefully\n";
+ ::write(STDERR_FILENO, error, sizeof(error));
+ ::_exit(EXIT_FAILURE);
+ }
+}
+
+
int main(int argc, char** argv)
{
IOSwitchboardServerFlags flags;
@@ -37,6 +68,17 @@ int main(int argc, char** argv)
EXIT(EXIT_FAILURE) << flags.usage(load.error());
}
+ Try<Nothing> pipe = os::pipe(unblockFds);
+ if (pipe.isError()) {
+ EXIT(EXIT_FAILURE) << "Failed to create pipe for signaling unblock:"
+ << " " + pipe.error();
+ }
+
+ if (os::signals::install(SIGTERM, sigtermHandler) != 0) {
+ EXIT(EXIT_FAILURE) << "Failed to register signal"
+ << " '" + stringify(strsignal(SIGTERM)) << "'";
+ }
+
Try<Owned<IOSwitchboardServer>> server = IOSwitchboardServer::create(
flags.tty,
flags.stdin_to_fd,
@@ -52,6 +94,20 @@ int main(int argc, char** argv)
" " << server.error();
}
+ io::poll(unblockFds[0], io::READ)
+ .onAny([server](const Future<short>& future) {
+ os::close(unblockFds[0]);
+
+ if (!future.isReady()) {
+ EXIT(EXIT_FAILURE) << "Failed while polling on 'unblockFds[0]': "
+ << (future.isFailed() ?
+ future.failure() :
+ "discarded");
+ }
+
+ server.get()->unblock();
+ });
+
Future<Nothing> run = server.get()->run();
run.await();
[4/6] mesos git commit: Added implementation of `watch()` to the
'IOSwitchboard' isolator.
Posted by ji...@apache.org.
Added implementation of `watch()` to the 'IOSwitchboard' isolator.
We use watch() to trigger the destruction of a container if the
io switchboard server process ever exits unexpectedly.
Review: https://reviews.apache.org/r/54356/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/052abf15
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/052abf15
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/052abf15
Branch: refs/heads/master
Commit: 052abf154510a9047019197e37857e8e26daab72
Parents: 4069e1c
Author: Kevin Klues <kl...@gmail.com>
Authored: Tue Dec 6 15:21:50 2016 -0800
Committer: Jie Yu <yu...@gmail.com>
Committed: Tue Dec 6 15:21:50 2016 -0800
----------------------------------------------------------------------
include/mesos/mesos.proto | 1 +
include/mesos/v1/mesos.proto | 1 +
.../containerizer/mesos/io/switchboard.cpp | 54 +++++++++++++++
.../containerizer/mesos/io/switchboard.hpp | 4 ++
.../containerizer/io_switchboard_tests.cpp | 72 ++++++++++++++++++++
5 files changed, 132 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/052abf15/include/mesos/mesos.proto
----------------------------------------------------------------------
diff --git a/include/mesos/mesos.proto b/include/mesos/mesos.proto
index 8279225..8b311fb 100644
--- a/include/mesos/mesos.proto
+++ b/include/mesos/mesos.proto
@@ -1649,6 +1649,7 @@ message TaskStatus {
REASON_GC_ERROR = 4;
REASON_INVALID_FRAMEWORKID = 5;
REASON_INVALID_OFFERS = 6;
+ REASON_IO_SWITCHBOARD_EXITED = 27;
REASON_MASTER_DISCONNECTED = 7;
REASON_RECONCILIATION = 9;
REASON_RESOURCES_UNKNOWN = 18;
http://git-wip-us.apache.org/repos/asf/mesos/blob/052abf15/include/mesos/v1/mesos.proto
----------------------------------------------------------------------
diff --git a/include/mesos/v1/mesos.proto b/include/mesos/v1/mesos.proto
index 4628cc6..70fab66 100644
--- a/include/mesos/v1/mesos.proto
+++ b/include/mesos/v1/mesos.proto
@@ -1648,6 +1648,7 @@ message TaskStatus {
REASON_GC_ERROR = 4;
REASON_INVALID_FRAMEWORKID = 5;
REASON_INVALID_OFFERS = 6;
+ REASON_IO_SWITCHBOARD_EXITED = 27;
REASON_MASTER_DISCONNECTED = 7;
REASON_RECONCILIATION = 9;
REASON_RESOURCES_UNKNOWN = 18;
http://git-wip-us.apache.org/repos/asf/mesos/blob/052abf15/src/slave/containerizer/mesos/io/switchboard.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/io/switchboard.cpp b/src/slave/containerizer/mesos/io/switchboard.cpp
index 8d4f1e9..e1ca695 100644
--- a/src/slave/containerizer/mesos/io/switchboard.cpp
+++ b/src/slave/containerizer/mesos/io/switchboard.cpp
@@ -89,6 +89,7 @@ using mesos::slave::ContainerConfig;
using mesos::slave::ContainerClass;
using mesos::slave::ContainerIO;
using mesos::slave::ContainerLaunchInfo;
+using mesos::slave::ContainerLimitation;
using mesos::slave::ContainerLogger;
using mesos::slave::ContainerState;
using mesos::slave::Isolator;
@@ -645,6 +646,59 @@ Future<http::Connection> IOSwitchboard::connect(
}
+Future<ContainerLimitation> IOSwitchboard::watch(
+ const ContainerID& containerId)
+{
+#ifdef __WINDOWS__
+ return Future<ContainerLimitation>();
+#else
+ if (local) {
+ return Future<ContainerLimitation>();
+ }
+
+ // We ignore unknown containers here because legacy containers
+ // without an io switchboard directory will not have an info struct
+ // created for during recovery. Likewise, containers launched
+ // by a previous agent with io switchboard server mode disabled will
+ // not have info structs created for them either. In both cases
+ // there is nothing to watch, so we return an unsatisfiable future.
+ if (!infos.contains(containerId)) {
+ return Future<ContainerLimitation>();
+ }
+
+ Future<Option<int>> status = infos[containerId]->status;
+
+ return status
+ .then([](const Option<int>& status) {
+ if (status.isNone()) {
+ return Future<ContainerLimitation>();
+ }
+
+ if (status.isSome() &&
+ WIFEXITED(status.get()) &&
+ WEXITSTATUS(status.get()) == 0) {
+ return Future<ContainerLimitation>();
+ }
+
+ ContainerLimitation limitation;
+ limitation.set_reason(TaskStatus::REASON_IO_SWITCHBOARD_EXITED);
+
+ if (WIFEXITED(status.get())) {
+ limitation.set_message(
+ "'IOSwitchboard' exited with status:"
+ " " + stringify(WEXITSTATUS(status.get())));
+ } else if (WIFSIGNALED(status.get())) {
+ limitation.set_message(
+ "'IOSwitchboard' exited with signal:"
+ " " + stringify(strsignal(WTERMSIG(status.get()))));
+ }
+
+ return Future<ContainerLimitation>(limitation);
+ });
+#endif // __WINDOWS__
+}
+
+
Future<Nothing> IOSwitchboard::cleanup(
const ContainerID& containerId)
{
http://git-wip-us.apache.org/repos/asf/mesos/blob/052abf15/src/slave/containerizer/mesos/io/switchboard.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/io/switchboard.hpp b/src/slave/containerizer/mesos/io/switchboard.hpp
index ca0eee6..a3f19fc 100644
--- a/src/slave/containerizer/mesos/io/switchboard.hpp
+++ b/src/slave/containerizer/mesos/io/switchboard.hpp
@@ -26,6 +26,7 @@
#include <stout/try.hpp>
+#include <mesos/slave/containerizer.hpp>
#include <mesos/slave/container_logger.hpp>
#include "slave/flags.hpp"
@@ -68,6 +69,9 @@ public:
const ContainerID& containerId,
const mesos::slave::ContainerConfig& containerConfig);
+ virtual process::Future<mesos::slave::ContainerLimitation> watch(
+ const ContainerID& containerId);
+
virtual process::Future<Nothing> cleanup(
const ContainerID& containerId);
http://git-wip-us.apache.org/repos/asf/mesos/blob/052abf15/src/tests/containerizer/io_switchboard_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/io_switchboard_tests.cpp b/src/tests/containerizer/io_switchboard_tests.cpp
index c8fe876..d98e4f4 100644
--- a/src/tests/containerizer/io_switchboard_tests.cpp
+++ b/src/tests/containerizer/io_switchboard_tests.cpp
@@ -35,6 +35,8 @@
#include "common/http.hpp"
#include "common/recordio.hpp"
+#include "slave/containerizer/mesos/paths.hpp"
+
#include "slave/containerizer/mesos/io/switchboard.hpp"
#include "tests/environment.hpp"
@@ -46,6 +48,8 @@ namespace http = process::http;
namespace unix = process::network::unix;
#endif // __WINDOWS__
+namespace paths = mesos::internal::slave::containerizer::paths;
+
using mesos::agent::Call;
using mesos::agent::ProcessIO;
@@ -508,6 +512,74 @@ TEST_F(IOSwitchboardTest, OutputRedirectionWithTTY)
EXPECT_SOME_EQ("HelloWorld", os::read(path::join(directory.get(), "stdout")));
}
+
+// This test verifies that a container will be
+// destroyed if its io switchboard exits unexpectedly.
+TEST_F(IOSwitchboardTest, KillSwitchboardContainerDestroyed)
+{
+ slave::Flags flags = CreateSlaveFlags();
+ flags.launcher = "posix";
+ flags.isolation = "posix/cpu";
+ flags.io_switchboard_enable_server = true;
+
+ Fetcher fetcher;
+
+ Try<MesosContainerizer*> create = MesosContainerizer::create(
+ flags,
+ false,
+ &fetcher);
+
+ ASSERT_SOME(create);
+
+ Owned<MesosContainerizer> containerizer(create.get());
+
+ SlaveState state;
+ state.id = SlaveID();
+
+ AWAIT_READY(containerizer->recover(state));
+
+ ContainerID containerId;
+ containerId.set_value(UUID::random().toString());
+
+ Try<string> directory = environment->mkdtemp();
+ ASSERT_SOME(directory);
+
+ ExecutorInfo executorInfo = createExecutorInfo(
+ "executor",
+ "sleep 1000",
+ "cpus:1");
+
+ Future<bool> launch = containerizer->launch(
+ containerId,
+ None(),
+ executorInfo,
+ directory.get(),
+ None(),
+ SlaveID(),
+ map<string, string>(),
+ true); // TODO(benh): Ever want to test not checkpointing?
+
+ AWAIT_ASSERT_TRUE(launch);
+
+ Result<pid_t> pid = paths::getContainerIOSwitchboardPid(
+ flags.runtime_dir, containerId);
+
+ ASSERT_SOME(pid);
+
+ ASSERT_EQ(0, os::kill(pid.get(), SIGKILL));
+
+ Future<Option<ContainerTermination>> wait = containerizer->wait(containerId);
+
+ AWAIT_READY(wait);
+ ASSERT_SOME(wait.get());
+
+ ASSERT_TRUE(wait.get()->has_status());
+ EXPECT_WTERMSIG_EQ(SIGKILL, wait.get()->status());
+
+ ASSERT_TRUE(wait.get()->reasons().size() == 1);
+ ASSERT_EQ(TaskStatus::REASON_IO_SWITCHBOARD_EXITED,
+ wait.get()->reasons().Get(0));
+}
#endif // __WINDOWS__
} // namespace tests {
[5/6] mesos git commit: Added test for IOSwitchboard `recovery()`.
Posted by ji...@apache.org.
Added test for IOSwitchboard `recovery()`.
Review: https://reviews.apache.org/r/54455/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/575e1964
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/575e1964
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/575e1964
Branch: refs/heads/master
Commit: 575e19646b30f0286a573af399cbb087c7cd44e5
Parents: 052abf1
Author: Kevin Klues <kl...@gmail.com>
Authored: Tue Dec 6 15:21:53 2016 -0800
Committer: Jie Yu <yu...@gmail.com>
Committed: Tue Dec 6 15:21:53 2016 -0800
----------------------------------------------------------------------
.../containerizer/io_switchboard_tests.cpp | 128 +++++++++++++++++++
1 file changed, 128 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/575e1964/src/tests/containerizer/io_switchboard_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/io_switchboard_tests.cpp b/src/tests/containerizer/io_switchboard_tests.cpp
index d98e4f4..851a101 100644
--- a/src/tests/containerizer/io_switchboard_tests.cpp
+++ b/src/tests/containerizer/io_switchboard_tests.cpp
@@ -32,9 +32,13 @@
#include <mesos/agent/agent.hpp>
+#include <mesos/master/detector.hpp>
+
#include "common/http.hpp"
#include "common/recordio.hpp"
+#include "messages/messages.hpp"
+
#include "slave/containerizer/mesos/paths.hpp"
#include "slave/containerizer/mesos/io/switchboard.hpp"
@@ -59,11 +63,15 @@ using mesos::internal::slave::MesosContainerizer;
using mesos::internal::slave::state::SlaveState;
+using mesos::master::detector::MasterDetector;
+
using mesos::slave::ContainerTermination;
using process::Future;
using process::Owned;
+using testing::Eq;
+
using std::string;
namespace mesos {
@@ -580,6 +588,126 @@ TEST_F(IOSwitchboardTest, KillSwitchboardContainerDestroyed)
ASSERT_EQ(TaskStatus::REASON_IO_SWITCHBOARD_EXITED,
wait.get()->reasons().Get(0));
}
+
+
+// This test verifies that the io switchboard isolator recovers properly.
+TEST_F(IOSwitchboardTest, RecoverThenKillSwitchboardContainerDestroyed)
+{
+ Try<Owned<cluster::Master>> master = StartMaster();
+ ASSERT_SOME(master);
+
+ slave::Flags flags = CreateSlaveFlags();
+ flags.launcher = "posix";
+ flags.isolation = "posix/cpu";
+ flags.io_switchboard_enable_server = true;
+
+ Fetcher fetcher;
+
+ Owned<MasterDetector> detector = master.get()->createDetector();
+
+ Try<MesosContainerizer*> create = MesosContainerizer::create(
+ flags,
+ false,
+ &fetcher);
+
+ ASSERT_SOME(create);
+
+ Owned<MesosContainerizer> containerizer(create.get());
+
+ Try<Owned<cluster::Slave>> slave = StartSlave(
+ detector.get(),
+ containerizer.get(),
+ flags);
+
+ ASSERT_SOME(slave);
+
+ MockScheduler sched;
+
+ // Enable checkpointing for the framework.
+ FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+ frameworkInfo.set_checkpoint(true);
+
+ MesosSchedulerDriver driver(
+ &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL);
+
+ EXPECT_CALL(sched, registered(_, _, _));
+
+ Future<vector<Offer>> offers;
+ EXPECT_CALL(sched, resourceOffers(_, _))
+ .WillOnce(FutureArg<1>(&offers))
+ .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+ driver.start();
+
+ AWAIT_READY(offers);
+ EXPECT_NE(0u, offers.get().size());
+
+ // Launch a task.
+ TaskInfo task = createTask(offers.get()[0], "sleep 1000");
+
+ // Drop the status update from the slave to the master so the
+ // scheduler never recieves the first task update.
+ Future<StatusUpdateMessage> update =
+ DROP_PROTOBUF(StatusUpdateMessage(), slave.get()->pid, master.get()->pid);
+
+ driver.launchTasks(offers.get()[0].id(), {task});
+
+ AWAIT_READY(update);
+
+ // Restart the slave with a new containerizer.
+ slave.get()->terminate();
+
+ create = MesosContainerizer::create(
+ flags,
+ false,
+ &fetcher);
+
+ ASSERT_SOME(create);
+
+ containerizer.reset(create.get());
+
+ // Expect three task updates.
+ // (1) TASK_RUNNING before recovery.
+ // (2) TASK_RUNNING after recovery.
+ // (3) TASK_FAILED after the io switchboard is killed.
+ Future<TaskStatus> statusRunning;
+ Future<TaskStatus> statusFailed;
+ EXPECT_CALL(sched, statusUpdate(_, _))
+ .WillOnce(FutureArg<1>(&statusRunning))
+ .WillOnce(FutureArg<1>(&statusRunning))
+ .WillOnce(FutureArg<1>(&statusFailed))
+ .WillRepeatedly(Return()); // Ignore subsequent updates.
+
+ slave = StartSlave(detector.get(), containerizer.get(), flags);
+ ASSERT_SOME(slave);
+
+ // Make sure the task comes back as running.
+ AWAIT_READY(statusRunning);
+ EXPECT_EQ(TASK_RUNNING, statusRunning->state());
+
+ // Kill the io switchboard for the task.
+ Future<hashset<ContainerID>> containers = containerizer.get()->containers();
+ AWAIT_READY(containers);
+ EXPECT_EQ(1u, containers.get().size());
+
+ Result<pid_t> pid = paths::getContainerIOSwitchboardPid(
+ flags.runtime_dir, *containers->begin());
+
+ ASSERT_SOME(pid);
+
+ ASSERT_EQ(0, os::kill(pid.get(), SIGKILL));
+
+ // Make sure the task is killed and its
+ // reason is an IO switchboard failure.
+ AWAIT_READY(statusFailed);
+ EXPECT_EQ(TASK_FAILED, statusFailed->state());
+
+ ASSERT_TRUE(statusFailed->has_reason());
+ EXPECT_EQ(TaskStatus::REASON_IO_SWITCHBOARD_EXITED, statusFailed->reason());
+
+ driver.stop();
+ driver.join();
+}
#endif // __WINDOWS__
} // namespace tests {