You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by ji...@apache.org on 2017/08/07 23:02:30 UTC

[3/4] mesos git commit: Used clone instead of fork in ns::clone.

Used clone instead of fork in ns::clone.

This patch is a workaround known to a glibc bug:
https://sourceware.org/bugzilla/show_bug.cgi?id=15392

Glibc version < 2.25 has an assertion in 'fork()' which checks if the
child process's pid is not the same as the parent. This invariant is
no longer true with pid namespaces being introduced. See more details
in MESOS-7858.

This patch uses 'clone()' instead 'fork()' to bypass this assertion
failure. Since we always uses async signal safe functions after the
clone and we don't leverage any 'at_fork' handlers, it's OK to use
'clone()' here to replace 'fork()'.

Review: https://reviews.apache.org/r/61476


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/7ad8d387
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/7ad8d387
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/7ad8d387

Branch: refs/heads/master
Commit: 7ad8d38725e0dc917f0fa3f9afc5263ec4981d3e
Parents: eb06f4d
Author: Jie Yu <yu...@gmail.com>
Authored: Mon Aug 7 11:59:37 2017 -0700
Committer: Jie Yu <yu...@gmail.com>
Committed: Mon Aug 7 16:02:24 2017 -0700

----------------------------------------------------------------------
 src/linux/ns.hpp | 83 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 49 insertions(+), 34 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/7ad8d387/src/linux/ns.hpp
----------------------------------------------------------------------
diff --git a/src/linux/ns.hpp b/src/linux/ns.hpp
index 69e52fa..c5aeb97 100644
--- a/src/linux/ns.hpp
+++ b/src/linux/ns.hpp
@@ -532,12 +532,60 @@ inline Try<pid_t> clone(
 
     close(fds.values());
 
+    auto grandchildMain = [=]() -> int {
+      // Grandchild (second child, now completely entered in the
+      // namespaces of the target).
+      //
+      // Now clone with the specified flags, close the unused socket,
+      // and execute the specified function.
+      pid_t pid = os::clone([=]() {
+        // Now send back the pid and have it be translated appropriately
+        // by the kernel to the enclosing pid namespace.
+        //
+        // NOTE: sending back the pid is best effort because we're going
+        // to exit no matter what.
+        ((struct ucred*) CMSG_DATA(CMSG_FIRSTHDR(&message)))->pid = ::getpid();
+        ((struct ucred*) CMSG_DATA(CMSG_FIRSTHDR(&message)))->uid = ::getuid();
+        ((struct ucred*) CMSG_DATA(CMSG_FIRSTHDR(&message)))->gid = ::getgid();
+
+        if (sendmsg(sockets[1], &message, 0) == -1) {
+          // Failed to send the pid back to the parent!
+          _exit(EXIT_FAILURE);
+        }
+
+        ::close(sockets[1]);
+
+        return f();
+      },
+      flags,
+      stack.get());
+
+      ::close(sockets[1]);
+
+      // TODO(benh): Kill ourselves with an exit status that we can
+      // decode above to determine why `clone` failed.
+      _exit(pid < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
+      UNREACHABLE();
+    };
+
     // Fork again to make sure we're actually in those namespaces
     // (required for the pid namespace at least).
     //
+    // NOTE: We use clone instead of fork here because of a glibc bug.
+    // glibc version < 2.25 has an assertion in 'fork()' which checks
+    // if the child process's pid is not the same as the parent. This
+    // invariant is no longer true with pid namespaces being
+    // introduced. See more details in MESOS-7858.
+    //
+    // NOTE: glibc 'fork()' also specifies 'CLONE_CHILD_SETTID' and
+    // 'CLONE_CHILD_CLEARTID' for the clone flags. However, since we
+    // are not using any pthread library in the grandchild, we don't
+    // need those flags.
+    //
     // TODO(benh): Don't do a fork if we're not actually entering the
     // PID namespace since the extra fork is unnecessary.
-    pid_t grandchild = fork();
+    pid_t grandchild = os::clone(grandchildMain, SIGCHLD);
+
     if (grandchild < 0) {
       // TODO(benh): Exit with `errno` in order to capture `fork` error?
       ::close(sockets[1]);
@@ -574,39 +622,6 @@ inline Try<pid_t> clone(
       assert(WIFSIGNALED(status));
       raise(WTERMSIG(status));
     }
-
-    // Grandchild (second child, now completely entered in the
-    // namespaces of the target).
-    //
-    // Now clone with the specified flags, close the unused socket,
-    // and execute the specified function.
-    pid_t pid = os::clone([=]() {
-      // Now send back the pid and have it be translated appropriately
-      // by the kernel to the enclosing pid namespace.
-      //
-      // NOTE: sending back the pid is best effort because we're going
-      // to exit no matter what.
-      ((struct ucred*) CMSG_DATA(CMSG_FIRSTHDR(&message)))->pid = ::getpid();
-      ((struct ucred*) CMSG_DATA(CMSG_FIRSTHDR(&message)))->uid = ::getuid();
-      ((struct ucred*) CMSG_DATA(CMSG_FIRSTHDR(&message)))->gid = ::getgid();
-
-      if (sendmsg(sockets[1], &message, 0) == -1) {
-        // Failed to send the pid back to the parent!
-        _exit(EXIT_FAILURE);
-      }
-
-      ::close(sockets[1]);
-
-      return f();
-    },
-    flags,
-    stack.get());
-
-    ::close(sockets[1]);
-
-    // TODO(benh): Kill ourselves with an exit status that we can
-    // decode above to determine why `clone` failed.
-    _exit(pid < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
   }
   UNREACHABLE();
 }