You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by jp...@apache.org on 2017/11/15 20:29:27 UTC
[1/4] mesos git commit: Added MESOS-8159 to the 1.4.2 CHANGELOG.
Repository: mesos
Updated Branches:
refs/heads/1.4.x cf4f4c0e8 -> dbc1414e4
Added MESOS-8159 to the 1.4.2 CHANGELOG.
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/dbc1414e
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/dbc1414e
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/dbc1414e
Branch: refs/heads/1.4.x
Commit: dbc1414e42c35f6e0b8e93435d0b0192de879359
Parents: e3a2d45
Author: James Peach <jp...@apache.org>
Authored: Wed Nov 15 11:32:00 2017 -0800
Committer: James Peach <jp...@apache.org>
Committed: Wed Nov 15 12:28:56 2017 -0800
----------------------------------------------------------------------
CHANGELOG | 1 +
1 file changed, 1 insertion(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/dbc1414e/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index ab8844b..c0ca159 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@ Release Notes - Mesos - Version 1.4.2 (WIP)
** Bug
* [MESOS-7975] - The command/default/docker executor can incorrectly send a TASK_FINISHED update even when the task is killed.
+ * [MESOS-8159] - ns::clone uses an async signal unsafe stack.
Release Notes - Mesos - Version 1.4.1
[2/4] mesos git commit: Changed `os:Stack` to allocate with `mmap`.
Posted by jp...@apache.org.
Changed `os:Stack` to allocate with `mmap`.
If `os:Stack` is allocated with `mmap`, this automatically
guarantees the correct alignment (since `mmap` is page-aligned
by definition).
Although `mmap` is not required to be async signal safe
(i.e. it might not be safe to call from a signal handler), in
practice it should be safe to call between `fork` and `exec`
since it is a raw system call that does not require any user
space locks to be held.
Review: https://reviews.apache.org/r/63677/
(cherry picked from commit 49f76864ec7f54f3fe7d1dabc7a199ef7157229a)
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/5915eb2a
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/5915eb2a
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/5915eb2a
Branch: refs/heads/1.4.x
Commit: 5915eb2a79bb408276964a95f62b0f81d8202763
Parents: 12682b1
Author: James Peach <jp...@apache.org>
Authored: Wed Nov 15 07:58:57 2017 -0800
Committer: James Peach <jp...@apache.org>
Committed: Wed Nov 15 12:28:56 2017 -0800
----------------------------------------------------------------------
3rdparty/stout/include/stout/os/linux.hpp | 47 ++++++++++++++------------
1 file changed, 25 insertions(+), 22 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/5915eb2a/3rdparty/stout/include/stout/os/linux.hpp
----------------------------------------------------------------------
diff --git a/3rdparty/stout/include/stout/os/linux.hpp b/3rdparty/stout/include/stout/os/linux.hpp
index e4d9666..50352c9 100644
--- a/3rdparty/stout/include/stout/os/linux.hpp
+++ b/3rdparty/stout/include/stout/os/linux.hpp
@@ -18,6 +18,7 @@
#error "stout/os/linux.hpp is only available on Linux systems."
#endif // __linux__
+#include <sys/mman.h>
#include <sys/types.h> // For pid_t.
#include <list>
@@ -33,7 +34,6 @@
#include <stout/result.hpp>
#include <stout/try.hpp>
-#include <stout/os/pagesize.hpp>
#include <stout/os/process.hpp>
namespace os {
@@ -57,56 +57,59 @@ public:
// 8 MiB is the default for "ulimit -s" on OSX and Linux.
static constexpr size_t DEFAULT_SIZE = 8 * 1024 * 1024;
- // Allocate a stack.
+ // Allocate a stack. Note that this is NOT async signal safe, nor
+ // safe to call between fork and exec.
static Try<Stack> create(size_t size)
{
Stack stack(size);
if (!stack.allocate()) {
- return ErrnoError("Failed to allocate and align stack");
+ return ErrnoError();
}
return stack;
}
+ explicit Stack(size_t size_) : size(size_) {}
+
+ // Allocate the stack using mmap. We avoid malloc because we want
+ // this to be safe to use between fork and exec where malloc might
+ // deadlock. Returns false and sets `errno` on failure.
bool allocate()
{
- // Allocate and align the memory to 16 bytes.
- // x86, x64, and AArch64/ARM64 all expect a 16 byte aligned stack.
- // ARM64/aarch64 enforces the alignment where x86/x64 does not.
- // Without this alignment Mesos will crash with a SIGBUS on ARM64/aarch64.
- int err = ::posix_memalign(
- reinterpret_cast<void**>(&address),
- os::pagesize(),
- size);
-
- if (err != 0) {
- errno = err;
+ int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+
+#if defined(MAP_STACK)
+ flags |= MAP_STACK;
+#endif
+
+ address = ::mmap(nullptr, size, PROT_READ | PROT_WRITE, flags, -1, 0);
+ if (address == MAP_FAILED) {
+ return false;
}
- return err == 0;
+ return true;
}
// Explicitly free the stack.
// The destructor won't free the allocated stack.
void deallocate()
{
- ::free(address);
- address = nullptr;
- size = 0;
+ PCHECK(::munmap(address, size) == 0);
+ address = MAP_FAILED;
}
// Stack grows down, return the first usable address.
char* start() const
{
- return address + size;
+ return address == MAP_FAILED
+ ? nullptr
+ : (static_cast<char*>(address) + size);
}
- explicit Stack(size_t size_) : size(size_) {}
-
private:
size_t size;
- char* address;
+ void* address = MAP_FAILED;
};
[4/4] mesos git commit: Added a non-allocating variant of `os::clone`.
Posted by jp...@apache.org.
Added a non-allocating variant of `os::clone`.
Added a non-allocating variant of `os::clone` in the
`os::signal_safe` namespace. This encapsulates the lambda
trampoline, but requires the caller to bring their own stack
since we can't know when it is safe to implicitly allocate
a stack.
Now that we have `os::signal_safe::clone`, there is no need for
`os::clone` to take an optional `os::Stack` argument. We can
remove that and simplify the cleanup logic.
Review: https://reviews.apache.org/r/63675/
(cherry picked from commit ec91293091be64919ca0e7f0f535fc98da397bcf)
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/12682b18
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/12682b18
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/12682b18
Branch: refs/heads/1.4.x
Commit: 12682b1894417cf5e1488ba293613004581759da
Parents: cf4f4c0
Author: James Peach <jp...@apache.org>
Authored: Wed Nov 15 07:58:53 2017 -0800
Committer: James Peach <jp...@apache.org>
Committed: Wed Nov 15 12:28:56 2017 -0800
----------------------------------------------------------------------
3rdparty/stout/include/stout/os/linux.hpp | 63 ++++++++++++++++++--------
1 file changed, 43 insertions(+), 20 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/12682b18/3rdparty/stout/include/stout/os/linux.hpp
----------------------------------------------------------------------
diff --git a/3rdparty/stout/include/stout/os/linux.hpp b/3rdparty/stout/include/stout/os/linux.hpp
index 68b4090..e4d9666 100644
--- a/3rdparty/stout/include/stout/os/linux.hpp
+++ b/3rdparty/stout/include/stout/os/linux.hpp
@@ -62,19 +62,29 @@ public:
{
Stack stack(size);
+ if (!stack.allocate()) {
+ return ErrnoError("Failed to allocate and align stack");
+ }
+
+ return stack;
+ }
+
+ bool allocate()
+ {
// Allocate and align the memory to 16 bytes.
// x86, x64, and AArch64/ARM64 all expect a 16 byte aligned stack.
// ARM64/aarch64 enforces the alignment where x86/x64 does not.
// Without this alignment Mesos will crash with a SIGBUS on ARM64/aarch64.
int err = ::posix_memalign(
- reinterpret_cast<void**>(&stack.address),
+ reinterpret_cast<void**>(&address),
os::pagesize(),
- stack.size);
- if (err) {
- return ErrnoError("Failed to allocate and align stack");
+ size);
+
+ if (err != 0) {
+ errno = err;
}
- return stack;
+ return err == 0;
}
// Explicitly free the stack.
@@ -87,23 +97,36 @@ public:
}
// Stack grows down, return the first usable address.
- char* start()
+ char* start() const
{
return address + size;
}
-private:
explicit Stack(size_t size_) : size(size_) {}
+private:
size_t size;
char* address;
};
+namespace signal_safe {
+
+
inline pid_t clone(
- const lambda::function<int()>& func,
+ const Stack& stack,
int flags,
- Option<Stack> stack = None())
+ const lambda::function<int()>& func)
+{
+ return ::clone(childMain, stack.start(), flags, (void*) &func);
+}
+
+} // namespace signal_safe {
+
+
+inline pid_t clone(
+ const lambda::function<int()>& func,
+ int flags)
{
// Stack for the child.
//
@@ -111,18 +134,16 @@ inline pid_t clone(
// glibc's 'clone' will modify the stack passed to it, therefore the
// stack must NOT be shared as multiple 'clone's can be invoked
// simultaneously.
+ Stack stack(Stack::DEFAULT_SIZE);
- bool cleanup = false;
- if (stack.isNone()) {
- Try<Stack> _stack = Stack::create(Stack::DEFAULT_SIZE);
- if (_stack.isError()) {
- return -1;
- }
- stack = _stack.get();
- cleanup = true;
+ if (!stack.allocate()) {
+ // TODO(jpeach): In MESOS-8155, we will return an
+ // ErrnoError() here, but for now keep the interface
+ // compatible.
+ return -1;
}
- pid_t pid = ::clone(childMain, stack->start(), flags, (void*) &func);
+ pid_t pid = signal_safe::clone(stack, flags, func);
// Given we allocated the stack ourselves, there are two
// circumstances where we need to delete the allocated stack to
@@ -135,8 +156,10 @@ inline pid_t clone(
// calling process. If CLONE_VM is set ::clone will create a
// thread which runs in the same memory space with the calling
// process, in which case we don't want to call delete!
- if (cleanup && (pid < 0 || !(flags & CLONE_VM))) {
- stack->deallocate();
+ //
+ // TODO(jpeach): In case (2) we will leak the stack memory.
+ if (pid < 0 || !(flags & CLONE_VM)) {
+ stack.deallocate();
}
return pid;
[3/4] mesos git commit: Improved the signal safety of `ns::clone`.
Posted by jp...@apache.org.
Improved the signal safety of `ns::clone`.
`ns::clone` was implicitly using malloc to allocate a stack
after fork but before exec, where we should be avoiding
`malloc`.
Updated the second clone to use `os::signal_safe::clone`. We
still explicitly allocate a stack after the fork, but since
that is allocated with `mmap` is it safe in this particular
case.
Review: https://reviews.apache.org/r/63678/
(cherry picked from commit f1c861de652ad1dccff67e1b0d1ba97d52f6b00e)
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/e3a2d458
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/e3a2d458
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/e3a2d458
Branch: refs/heads/1.4.x
Commit: e3a2d458183ad15ece9c3e1ad0acdc992f8042f1
Parents: 5915eb2
Author: James Peach <jp...@apache.org>
Authored: Wed Nov 15 11:21:54 2017 -0800
Committer: James Peach <jp...@apache.org>
Committed: Wed Nov 15 12:28:56 2017 -0800
----------------------------------------------------------------------
src/linux/ns.hpp | 64 +++++++++++++++++++++++++++++++++------------------
1 file changed, 42 insertions(+), 22 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/e3a2d458/src/linux/ns.hpp
----------------------------------------------------------------------
diff --git a/src/linux/ns.hpp b/src/linux/ns.hpp
index e961163..e642113 100644
--- a/src/linux/ns.hpp
+++ b/src/linux/ns.hpp
@@ -322,6 +322,12 @@ inline Try<pid_t> clone(
{CLONE_NEWNS, "mnt"}
};
+ // Since we assume below that the parent can deallocate the stack
+ // after cloning the children, the caller must not pass CLONE_VM.
+ // That would cause the both processes to share their address space
+ // so deallocating the stack in the parent would affect the child.
+ CHECK_EQ(0, flags & CLONE_VM);
+
// Support for user namespaces in all filesystems is incomplete
// until version 3.12 (see 'Availability' in man page of
// 'user_namespaces'), so for now we don't support entering them.
@@ -542,27 +548,31 @@ inline Try<pid_t> clone(
//
// Now clone with the specified flags, close the unused socket,
// and execute the specified function.
- pid_t pid = os::clone([=]() {
- // Now send back the pid and have it be translated appropriately
- // by the kernel to the enclosing pid namespace.
- //
- // NOTE: sending back the pid is best effort because we're going
- // to exit no matter what.
- ((struct ucred*) CMSG_DATA(CMSG_FIRSTHDR(&message)))->pid = ::getpid();
- ((struct ucred*) CMSG_DATA(CMSG_FIRSTHDR(&message)))->uid = ::getuid();
- ((struct ucred*) CMSG_DATA(CMSG_FIRSTHDR(&message)))->gid = ::getgid();
-
- if (sendmsg(sockets[1], &message, 0) == -1) {
- // Failed to send the pid back to the parent!
- _exit(EXIT_FAILURE);
- }
-
- ::close(sockets[1]);
-
- return f();
- },
- flags,
- stack.get());
+ pid_t pid = os::signal_safe::clone(
+ stack.get(),
+ flags,
+ [=]() {
+ struct ucred* cred = reinterpret_cast<struct ucred*>(
+ CMSG_DATA(CMSG_FIRSTHDR(&message)));
+
+ // Now send back the pid and have it be translated appropriately
+ // by the kernel to the enclosing pid namespace.
+ //
+ // NOTE: sending back the pid is best effort because we're going
+ // to exit no matter what.
+ cred->pid = ::getpid();
+ cred->uid = ::getuid();
+ cred->gid = ::getgid();
+
+ if (sendmsg(sockets[1], &message, 0) == -1) {
+ // Failed to send the pid back to the parent!
+ _exit(EXIT_FAILURE);
+ }
+
+ ::close(sockets[1]);
+
+ return f();
+ });
::close(sockets[1]);
@@ -572,6 +582,13 @@ inline Try<pid_t> clone(
UNREACHABLE();
};
+ os::Stack grandchildStack(os::Stack::DEFAULT_SIZE);
+
+ if (!grandchildStack.allocate()) {
+ ::close(sockets[1]);
+ _exit(EXIT_FAILURE);
+ }
+
// Fork again to make sure we're actually in those namespaces
// (required for the pid namespace at least).
//
@@ -588,7 +605,10 @@ inline Try<pid_t> clone(
//
// TODO(benh): Don't do a fork if we're not actually entering the
// PID namespace since the extra fork is unnecessary.
- pid_t grandchild = os::clone(grandchildMain, SIGCHLD);
+ pid_t grandchild =
+ os::signal_safe::clone(grandchildStack, SIGCHLD, grandchildMain);
+
+ grandchildStack.deallocate();
if (grandchild < 0) {
// TODO(benh): Exit with `errno` in order to capture `fork` error?