You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by jp...@apache.org on 2017/11/15 20:29:27 UTC

[1/4] mesos git commit: Added MESOS-8159 to the 1.4.2 CHANGELOG.

Repository: mesos
Updated Branches:
  refs/heads/1.4.x cf4f4c0e8 -> dbc1414e4


Added MESOS-8159 to the 1.4.2 CHANGELOG.


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/dbc1414e
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/dbc1414e
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/dbc1414e

Branch: refs/heads/1.4.x
Commit: dbc1414e42c35f6e0b8e93435d0b0192de879359
Parents: e3a2d45
Author: James Peach <jp...@apache.org>
Authored: Wed Nov 15 11:32:00 2017 -0800
Committer: James Peach <jp...@apache.org>
Committed: Wed Nov 15 12:28:56 2017 -0800

----------------------------------------------------------------------
 CHANGELOG | 1 +
 1 file changed, 1 insertion(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/dbc1414e/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index ab8844b..c0ca159 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@ Release Notes - Mesos - Version 1.4.2 (WIP)
 
 ** Bug
  * [MESOS-7975] - The command/default/docker executor can incorrectly send a TASK_FINISHED update even when the task is killed.
+ * [MESOS-8159] - ns::clone uses an async signal unsafe stack.
 
 
 Release Notes - Mesos - Version 1.4.1


[2/4] mesos git commit: Changed `os:Stack` to allocate with `mmap`.

Posted by jp...@apache.org.
Changed `os:Stack` to allocate with `mmap`.

If `os:Stack` is allocated with `mmap`, this automatically
guarantees the correct alignment (since `mmap` is page-aligned
by definition).

Although `mmap` is not required to be async signal safe
(i.e. it might not be safe to call from a signal handler), in
practice it should be safe to call between `fork` and `exec`
since it is a raw system call that does not require any user
space locks to be held.

Review: https://reviews.apache.org/r/63677/
(cherry picked from commit 49f76864ec7f54f3fe7d1dabc7a199ef7157229a)


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/5915eb2a
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/5915eb2a
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/5915eb2a

Branch: refs/heads/1.4.x
Commit: 5915eb2a79bb408276964a95f62b0f81d8202763
Parents: 12682b1
Author: James Peach <jp...@apache.org>
Authored: Wed Nov 15 07:58:57 2017 -0800
Committer: James Peach <jp...@apache.org>
Committed: Wed Nov 15 12:28:56 2017 -0800

----------------------------------------------------------------------
 3rdparty/stout/include/stout/os/linux.hpp | 47 ++++++++++++++------------
 1 file changed, 25 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/5915eb2a/3rdparty/stout/include/stout/os/linux.hpp
----------------------------------------------------------------------
diff --git a/3rdparty/stout/include/stout/os/linux.hpp b/3rdparty/stout/include/stout/os/linux.hpp
index e4d9666..50352c9 100644
--- a/3rdparty/stout/include/stout/os/linux.hpp
+++ b/3rdparty/stout/include/stout/os/linux.hpp
@@ -18,6 +18,7 @@
 #error "stout/os/linux.hpp is only available on Linux systems."
 #endif // __linux__
 
+#include <sys/mman.h>
 #include <sys/types.h> // For pid_t.
 
 #include <list>
@@ -33,7 +34,6 @@
 #include <stout/result.hpp>
 #include <stout/try.hpp>
 
-#include <stout/os/pagesize.hpp>
 #include <stout/os/process.hpp>
 
 namespace os {
@@ -57,56 +57,59 @@ public:
   // 8 MiB is the default for "ulimit -s" on OSX and Linux.
   static constexpr size_t DEFAULT_SIZE = 8 * 1024 * 1024;
 
-  // Allocate a stack.
+  // Allocate a stack. Note that this is NOT async signal safe, nor
+  // safe to call between fork and exec.
   static Try<Stack> create(size_t size)
   {
     Stack stack(size);
 
     if (!stack.allocate()) {
-      return ErrnoError("Failed to allocate and align stack");
+      return ErrnoError();
     }
 
     return stack;
   }
 
+  explicit Stack(size_t size_) : size(size_) {}
+
+  // Allocate the stack using mmap. We avoid malloc because we want
+  // this to be safe to use between fork and exec where malloc might
+  // deadlock. Returns false and sets `errno` on failure.
   bool allocate()
   {
-    // Allocate and align the memory to 16 bytes.
-    // x86, x64, and AArch64/ARM64 all expect a 16 byte aligned stack.
-    // ARM64/aarch64 enforces the alignment where x86/x64 does not.
-    // Without this alignment Mesos will crash with a SIGBUS on ARM64/aarch64.
-    int err = ::posix_memalign(
-                reinterpret_cast<void**>(&address),
-                os::pagesize(),
-                size);
-
-    if (err != 0) {
-        errno = err;
+    int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+
+#if defined(MAP_STACK)
+    flags |= MAP_STACK;
+#endif
+
+    address = ::mmap(nullptr, size, PROT_READ | PROT_WRITE, flags, -1, 0);
+    if (address == MAP_FAILED) {
+      return false;
     }
 
-    return err == 0;
+    return true;
   }
 
   // Explicitly free the stack.
   // The destructor won't free the allocated stack.
   void deallocate()
   {
-    ::free(address);
-    address = nullptr;
-    size = 0;
+    PCHECK(::munmap(address, size) == 0);
+    address = MAP_FAILED;
   }
 
   // Stack grows down, return the first usable address.
   char* start() const
   {
-    return address + size;
+    return address == MAP_FAILED
+      ? nullptr
+      : (static_cast<char*>(address) + size);
   }
 
-  explicit Stack(size_t size_) : size(size_) {}
-
 private:
   size_t size;
-  char* address;
+  void* address = MAP_FAILED;
 };
 
 


[4/4] mesos git commit: Added a non-allocating variant of `os::clone`.

Posted by jp...@apache.org.
Added a non-allocating variant of `os::clone`.

Added a non-allocating variant of `os::clone` in the
`os::signal_safe` namespace. This encapsulates the lambda
trampoline, but requires the caller to bring their own stack
since we can't know when it is safe to implicitly allocate
a stack.

Now that we have `os::signal_safe::clone`, there is no need for
`os::clone` to take an optional `os::Stack` argument. We can
remove that and simplify the cleanup logic.

Review: https://reviews.apache.org/r/63675/
(cherry picked from commit ec91293091be64919ca0e7f0f535fc98da397bcf)


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/12682b18
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/12682b18
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/12682b18

Branch: refs/heads/1.4.x
Commit: 12682b1894417cf5e1488ba293613004581759da
Parents: cf4f4c0
Author: James Peach <jp...@apache.org>
Authored: Wed Nov 15 07:58:53 2017 -0800
Committer: James Peach <jp...@apache.org>
Committed: Wed Nov 15 12:28:56 2017 -0800

----------------------------------------------------------------------
 3rdparty/stout/include/stout/os/linux.hpp | 63 ++++++++++++++++++--------
 1 file changed, 43 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/12682b18/3rdparty/stout/include/stout/os/linux.hpp
----------------------------------------------------------------------
diff --git a/3rdparty/stout/include/stout/os/linux.hpp b/3rdparty/stout/include/stout/os/linux.hpp
index 68b4090..e4d9666 100644
--- a/3rdparty/stout/include/stout/os/linux.hpp
+++ b/3rdparty/stout/include/stout/os/linux.hpp
@@ -62,19 +62,29 @@ public:
   {
     Stack stack(size);
 
+    if (!stack.allocate()) {
+      return ErrnoError("Failed to allocate and align stack");
+    }
+
+    return stack;
+  }
+
+  bool allocate()
+  {
     // Allocate and align the memory to 16 bytes.
     // x86, x64, and AArch64/ARM64 all expect a 16 byte aligned stack.
     // ARM64/aarch64 enforces the alignment where x86/x64 does not.
     // Without this alignment Mesos will crash with a SIGBUS on ARM64/aarch64.
     int err = ::posix_memalign(
-                reinterpret_cast<void**>(&stack.address),
+                reinterpret_cast<void**>(&address),
                 os::pagesize(),
-                stack.size);
-    if (err) {
-      return ErrnoError("Failed to allocate and align stack");
+                size);
+
+    if (err != 0) {
+        errno = err;
     }
 
-    return stack;
+    return err == 0;
   }
 
   // Explicitly free the stack.
@@ -87,23 +97,36 @@ public:
   }
 
   // Stack grows down, return the first usable address.
-  char* start()
+  char* start() const
   {
     return address + size;
   }
 
-private:
   explicit Stack(size_t size_) : size(size_) {}
 
+private:
   size_t size;
   char* address;
 };
 
 
+namespace signal_safe {
+
+
 inline pid_t clone(
-    const lambda::function<int()>& func,
+    const Stack& stack,
     int flags,
-    Option<Stack> stack = None())
+    const lambda::function<int()>& func)
+{
+  return ::clone(childMain, stack.start(), flags, (void*) &func);
+}
+
+} // namespace signal_safe {
+
+
+inline pid_t clone(
+    const lambda::function<int()>& func,
+    int flags)
 {
   // Stack for the child.
   //
@@ -111,18 +134,16 @@ inline pid_t clone(
   // glibc's 'clone' will modify the stack passed to it, therefore the
   // stack must NOT be shared as multiple 'clone's can be invoked
   // simultaneously.
+  Stack stack(Stack::DEFAULT_SIZE);
 
-  bool cleanup = false;
-  if (stack.isNone()) {
-    Try<Stack> _stack = Stack::create(Stack::DEFAULT_SIZE);
-    if (_stack.isError()) {
-        return -1;
-    }
-    stack = _stack.get();
-    cleanup = true;
+  if (!stack.allocate()) {
+    // TODO(jpeach): In MESOS-8155, we will return an
+    // ErrnoError() here, but for now keep the interface
+    // compatible.
+    return -1;
   }
 
-  pid_t pid = ::clone(childMain, stack->start(), flags, (void*) &func);
+  pid_t pid = signal_safe::clone(stack, flags, func);
 
   // Given we allocated the stack ourselves, there are two
   // circumstances where we need to delete the allocated stack to
@@ -135,8 +156,10 @@ inline pid_t clone(
   //     calling process. If CLONE_VM is set ::clone will create a
   //     thread which runs in the same memory space with the calling
   //     process, in which case we don't want to call delete!
-  if (cleanup && (pid < 0 || !(flags & CLONE_VM))) {
-    stack->deallocate();
+  //
+  // TODO(jpeach): In case (2) we will leak the stack memory.
+  if (pid < 0 || !(flags & CLONE_VM)) {
+    stack.deallocate();
   }
 
   return pid;


[3/4] mesos git commit: Improved the signal safety of `ns::clone`.

Posted by jp...@apache.org.
Improved the signal safety of `ns::clone`.

`ns::clone` was implicitly using malloc to allocate a stack
after fork but before exec, where we should be avoiding
`malloc`.

Updated the second clone to use `os::signal_safe::clone`. We
still explicitly allocate a stack after the fork, but since
that is allocated with `mmap` is it safe in this particular
case.

Review: https://reviews.apache.org/r/63678/
(cherry picked from commit f1c861de652ad1dccff67e1b0d1ba97d52f6b00e)


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/e3a2d458
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/e3a2d458
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/e3a2d458

Branch: refs/heads/1.4.x
Commit: e3a2d458183ad15ece9c3e1ad0acdc992f8042f1
Parents: 5915eb2
Author: James Peach <jp...@apache.org>
Authored: Wed Nov 15 11:21:54 2017 -0800
Committer: James Peach <jp...@apache.org>
Committed: Wed Nov 15 12:28:56 2017 -0800

----------------------------------------------------------------------
 src/linux/ns.hpp | 64 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 42 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/e3a2d458/src/linux/ns.hpp
----------------------------------------------------------------------
diff --git a/src/linux/ns.hpp b/src/linux/ns.hpp
index e961163..e642113 100644
--- a/src/linux/ns.hpp
+++ b/src/linux/ns.hpp
@@ -322,6 +322,12 @@ inline Try<pid_t> clone(
     {CLONE_NEWNS, "mnt"}
   };
 
+  // Since we assume below that the parent can deallocate the stack
+  // after cloning the children, the caller must not pass CLONE_VM.
+  // That would cause the both processes to share their address space
+  // so deallocating the stack in the parent would affect the child.
+  CHECK_EQ(0, flags & CLONE_VM);
+
   // Support for user namespaces in all filesystems is incomplete
   // until version 3.12 (see 'Availability' in man page of
   // 'user_namespaces'), so for now we don't support entering them.
@@ -542,27 +548,31 @@ inline Try<pid_t> clone(
       //
       // Now clone with the specified flags, close the unused socket,
       // and execute the specified function.
-      pid_t pid = os::clone([=]() {
-        // Now send back the pid and have it be translated appropriately
-        // by the kernel to the enclosing pid namespace.
-        //
-        // NOTE: sending back the pid is best effort because we're going
-        // to exit no matter what.
-        ((struct ucred*) CMSG_DATA(CMSG_FIRSTHDR(&message)))->pid = ::getpid();
-        ((struct ucred*) CMSG_DATA(CMSG_FIRSTHDR(&message)))->uid = ::getuid();
-        ((struct ucred*) CMSG_DATA(CMSG_FIRSTHDR(&message)))->gid = ::getgid();
-
-        if (sendmsg(sockets[1], &message, 0) == -1) {
-          // Failed to send the pid back to the parent!
-          _exit(EXIT_FAILURE);
-        }
-
-        ::close(sockets[1]);
-
-        return f();
-      },
-      flags,
-      stack.get());
+      pid_t pid = os::signal_safe::clone(
+          stack.get(),
+          flags,
+          [=]() {
+            struct ucred* cred = reinterpret_cast<struct ucred*>(
+                CMSG_DATA(CMSG_FIRSTHDR(&message)));
+
+            // Now send back the pid and have it be translated appropriately
+            // by the kernel to the enclosing pid namespace.
+            //
+            // NOTE: sending back the pid is best effort because we're going
+            // to exit no matter what.
+            cred->pid = ::getpid();
+            cred->uid = ::getuid();
+            cred->gid = ::getgid();
+
+            if (sendmsg(sockets[1], &message, 0) == -1) {
+              // Failed to send the pid back to the parent!
+              _exit(EXIT_FAILURE);
+            }
+
+            ::close(sockets[1]);
+
+            return f();
+          });
 
       ::close(sockets[1]);
 
@@ -572,6 +582,13 @@ inline Try<pid_t> clone(
       UNREACHABLE();
     };
 
+    os::Stack grandchildStack(os::Stack::DEFAULT_SIZE);
+
+    if (!grandchildStack.allocate()) {
+      ::close(sockets[1]);
+      _exit(EXIT_FAILURE);
+    }
+
     // Fork again to make sure we're actually in those namespaces
     // (required for the pid namespace at least).
     //
@@ -588,7 +605,10 @@ inline Try<pid_t> clone(
     //
     // TODO(benh): Don't do a fork if we're not actually entering the
     // PID namespace since the extra fork is unnecessary.
-    pid_t grandchild = os::clone(grandchildMain, SIGCHLD);
+    pid_t grandchild =
+      os::signal_safe::clone(grandchildStack, SIGCHLD, grandchildMain);
+
+    grandchildStack.deallocate();
 
     if (grandchild < 0) {
       // TODO(benh): Exit with `errno` in order to capture `fork` error?