You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by ji...@apache.org on 2017/03/18 03:39:08 UTC

[2/2] mesos git commit: Ensured the cgroup memory+swap limit is no less than the memory limit.

Ensured the cgroup memory+swap limit is no less than the memory limit.

This addressed the bug we saw in MESOS-7237. Linux kernel will ensure
cgroup memory+swap limit (i.e., 'memory.memsw.limit_in_bytes') is always
no less than the memory limit ('memory.limit_in_bytes'). Prior to this
patch, the memory subsystem in the cgroups isolator is buggy because
memory+swap limit is always changed after the memory limit. This will
cause EINVAL if we increase the memory limit of the container.

Review: https://reviews.apache.org/r/57574


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/dc498b7c
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/dc498b7c
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/dc498b7c

Branch: refs/heads/1.2.x
Commit: dc498b7ca1f38dab5187d0553b147d0f56ad10bd
Parents: 801cdfa
Author: Jie Yu <yu...@gmail.com>
Authored: Mon Mar 13 16:58:03 2017 -0700
Committer: Jie Yu <yu...@gmail.com>
Committed: Fri Mar 17 20:38:48 2017 -0700

----------------------------------------------------------------------
 .../isolators/cgroups/subsystems/memory.cpp     | 84 +++++++++++++-------
 1 file changed, 56 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/dc498b7c/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
index 009e996..113e908 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
@@ -211,39 +211,16 @@ Future<Nothing> MemorySubsystem::update(
         ": " + currentLimit.error());
   }
 
-  // Determine whether to set the hard limit. We only update the hard
-  // limit if this is the first time or when we're raising the
-  // existing limit, then we can update the hard limit safely.
-  // Otherwise, if we need to decrease 'memory.limit_in_bytes' we may
-  // induce an OOM if too much memory is in use.  As a result, we only
-  // update the soft limit when the memory reservation is being
-  // reduced. This is probably okay if the machine has available
-  // resources.
-  //
-  // TODO(benh): Introduce a MemoryWatcherProcess which monitors the
-  // discrepancy between usage and soft limit and introduces a "manual
-  // oom" if necessary.
-  //
-  // If this is the first time, 'memory.limit_in_bytes' is the inital
-  // value which may be one of following possible values:
-  //   * LONG_MAX (Linux Kernel Version < 3.12)
-  //   * ULONG_MAX (3.12 <= Linux Kernel Version < 3.19)
-  //   * LONG_MAX / pageSize * pageSize (Linux Kernel Version >= 3.19)
-  // Thus, if 'currentLimit' is greater or equals to 'initialLimit'
-  // below, we know it's the first time.
-  static const size_t pageSize = os::pagesize();
-  Bytes initialLimit(static_cast<uint64_t>(LONG_MAX / pageSize * pageSize));
+  bool limitSwap = flags.cgroups_limit_swap;
 
-  if (currentLimit.get() >= initialLimit || limit > currentLimit.get()) {
-    // We always set limit_in_bytes first and optionally set
-    // memsw.limit_in_bytes if `cgroups_limit_swap` is true.
+  auto setLimitInBytes = [=]() -> Try<Nothing> {
     Try<Nothing> write = cgroups::memory::limit_in_bytes(
         hierarchy,
         cgroup,
         limit);
 
     if (write.isError()) {
-      return Failure(
+      return Error(
           "Failed to set 'memory.limit_in_bytes'"
           ": " + write.error());
     }
@@ -251,14 +228,18 @@ Future<Nothing> MemorySubsystem::update(
     LOG(INFO) << "Updated 'memory.limit_in_bytes' to " << limit
               << " for container " << containerId;
 
-    if (flags.cgroups_limit_swap) {
+    return Nothing();
+  };
+
+  auto setMemswLimitInBytes = [=]() -> Try<Nothing> {
+    if (limitSwap) {
       Try<bool> write = cgroups::memory::memsw_limit_in_bytes(
           hierarchy,
           cgroup,
           limit);
 
       if (write.isError()) {
-        return Failure(
+        return Error(
             "Failed to set 'memory.memsw.limit_in_bytes'"
             ": " + write.error());
       }
@@ -266,6 +247,53 @@ Future<Nothing> MemorySubsystem::update(
       LOG(INFO) << "Updated 'memory.memsw.limit_in_bytes' to " << limit
                 << " for container " << containerId;
     }
+
+    return Nothing();
+  };
+
+  vector<lambda::function<Try<Nothing>(void)>> setFunctions;
+
+  // Now, determine whether to set the hard limit. We only update the
+  // hard limit if this is the first time or when we're raising the
+  // existing limit, then we can update the hard limit safely.
+  // Otherwise, if we need to decrease 'memory.limit_in_bytes' we may
+  // induce an OOM if too much memory is in use. As a result, we only
+  // update the soft limit when the memory reservation is being
+  // reduced. This is probably okay if the machine has available
+  // resources.
+  //
+  // TODO(benh): Introduce a MemoryWatcherProcess which monitors the
+  // discrepancy between usage and soft limit and introduces a "manual
+  // oom" if necessary.
+  //
+  // If this is the first time, 'memory.limit_in_bytes' is unlimited
+  // which may be one of following possible values:
+  //   * LONG_MAX (Linux Kernel Version < 3.12)
+  //   * ULONG_MAX (3.12 <= Linux Kernel Version < 3.19)
+  //   * LONG_MAX / pageSize * pageSize (Linux Kernel Version >= 3.19)
+  static const size_t pageSize = os::pagesize();
+  Bytes unlimited(static_cast<uint64_t>(LONG_MAX / pageSize * pageSize));
+
+  // NOTE: It's required by the Linux kernel that
+  // 'memory.limit_in_bytes' should be less than or equal to
+  // 'memory.memsw.limit_in_bytes'. Otherwise, the kernel will fail
+  // the cgroup write with EINVAL. As a result, the order of setting
+  // these two control files is important. See MESOS-7237 for details.
+  if (currentLimit.get() >= unlimited) {
+    // This is the first time memory limit is being set. So
+    // effectively we are reducing the memory limits because of which
+    // we need to set the 'memory.limit_in_bytes' before setting
+    // 'memory.memsw.limit_in_bytes'
+    setFunctions = {setLimitInBytes, setMemswLimitInBytes};
+  } else if (limit > currentLimit.get()) {
+    setFunctions = {setMemswLimitInBytes, setLimitInBytes};
+  }
+
+  foreach (const auto& setFunction, setFunctions) {
+    Try<Nothing> result = setFunction();
+    if (result.isError()) {
+      return Failure(result.error());
+    }
   }
 
   return Nothing();