You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by ji...@apache.org on 2017/03/18 03:33:27 UTC

[1/2] mesos git commit: Ensured the cgroup memory+swap limit is no less than the memory limit.

Repository: mesos
Updated Branches:
  refs/heads/master d0cd96b8f -> a8527e05b


Ensured the cgroup memory+swap limit is no less than the memory limit.

This addressed the bug we saw in MESOS-7237. Linux kernel will ensure
cgroup memory+swap limit (i.e., 'memory.memsw.limit_in_bytes') is always
no less than the memory limit ('memory.limit_in_bytes'). Prior to this
patch, the memory subsystem in the cgroups isolator is buggy because
memory+swap limit is always changed after the memory limit. This will
cause EINVAL if we increase the memory limit of the container.

Review: https://reviews.apache.org/r/57574


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/9ba473a1
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/9ba473a1
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/9ba473a1

Branch: refs/heads/master
Commit: 9ba473a1e837b0bfb689a5a7f4f5b5d4b12a6db8
Parents: d0cd96b
Author: Jie Yu <yu...@gmail.com>
Authored: Mon Mar 13 16:58:03 2017 -0700
Committer: Jie Yu <yu...@gmail.com>
Committed: Fri Mar 17 20:32:59 2017 -0700

----------------------------------------------------------------------
 .../isolators/cgroups/subsystems/memory.cpp     | 84 +++++++++++++-------
 1 file changed, 56 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/9ba473a1/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
index 009e996..113e908 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
@@ -211,39 +211,16 @@ Future<Nothing> MemorySubsystem::update(
         ": " + currentLimit.error());
   }
 
-  // Determine whether to set the hard limit. We only update the hard
-  // limit if this is the first time or when we're raising the
-  // existing limit, then we can update the hard limit safely.
-  // Otherwise, if we need to decrease 'memory.limit_in_bytes' we may
-  // induce an OOM if too much memory is in use.  As a result, we only
-  // update the soft limit when the memory reservation is being
-  // reduced. This is probably okay if the machine has available
-  // resources.
-  //
-  // TODO(benh): Introduce a MemoryWatcherProcess which monitors the
-  // discrepancy between usage and soft limit and introduces a "manual
-  // oom" if necessary.
-  //
-  // If this is the first time, 'memory.limit_in_bytes' is the inital
-  // value which may be one of following possible values:
-  //   * LONG_MAX (Linux Kernel Version < 3.12)
-  //   * ULONG_MAX (3.12 <= Linux Kernel Version < 3.19)
-  //   * LONG_MAX / pageSize * pageSize (Linux Kernel Version >= 3.19)
-  // Thus, if 'currentLimit' is greater or equals to 'initialLimit'
-  // below, we know it's the first time.
-  static const size_t pageSize = os::pagesize();
-  Bytes initialLimit(static_cast<uint64_t>(LONG_MAX / pageSize * pageSize));
+  bool limitSwap = flags.cgroups_limit_swap;
 
-  if (currentLimit.get() >= initialLimit || limit > currentLimit.get()) {
-    // We always set limit_in_bytes first and optionally set
-    // memsw.limit_in_bytes if `cgroups_limit_swap` is true.
+  auto setLimitInBytes = [=]() -> Try<Nothing> {
     Try<Nothing> write = cgroups::memory::limit_in_bytes(
         hierarchy,
         cgroup,
         limit);
 
     if (write.isError()) {
-      return Failure(
+      return Error(
           "Failed to set 'memory.limit_in_bytes'"
           ": " + write.error());
     }
@@ -251,14 +228,18 @@ Future<Nothing> MemorySubsystem::update(
     LOG(INFO) << "Updated 'memory.limit_in_bytes' to " << limit
               << " for container " << containerId;
 
-    if (flags.cgroups_limit_swap) {
+    return Nothing();
+  };
+
+  auto setMemswLimitInBytes = [=]() -> Try<Nothing> {
+    if (limitSwap) {
       Try<bool> write = cgroups::memory::memsw_limit_in_bytes(
           hierarchy,
           cgroup,
           limit);
 
       if (write.isError()) {
-        return Failure(
+        return Error(
             "Failed to set 'memory.memsw.limit_in_bytes'"
             ": " + write.error());
       }
@@ -266,6 +247,53 @@ Future<Nothing> MemorySubsystem::update(
       LOG(INFO) << "Updated 'memory.memsw.limit_in_bytes' to " << limit
                 << " for container " << containerId;
     }
+
+    return Nothing();
+  };
+
+  vector<lambda::function<Try<Nothing>(void)>> setFunctions;
+
+  // Now, determine whether to set the hard limit. We only update the
+  // hard limit if this is the first time or when we're raising the
+  // existing limit, then we can update the hard limit safely.
+  // Otherwise, if we need to decrease 'memory.limit_in_bytes' we may
+  // induce an OOM if too much memory is in use. As a result, we only
+  // update the soft limit when the memory reservation is being
+  // reduced. This is probably okay if the machine has available
+  // resources.
+  //
+  // TODO(benh): Introduce a MemoryWatcherProcess which monitors the
+  // discrepancy between usage and soft limit and introduces a "manual
+  // oom" if necessary.
+  //
+  // If this is the first time, 'memory.limit_in_bytes' is unlimited
+  // which may be one of following possible values:
+  //   * LONG_MAX (Linux Kernel Version < 3.12)
+  //   * ULONG_MAX (3.12 <= Linux Kernel Version < 3.19)
+  //   * LONG_MAX / pageSize * pageSize (Linux Kernel Version >= 3.19)
+  static const size_t pageSize = os::pagesize();
+  Bytes unlimited(static_cast<uint64_t>(LONG_MAX / pageSize * pageSize));
+
+  // NOTE: It's required by the Linux kernel that
+  // 'memory.limit_in_bytes' should be less than or equal to
+  // 'memory.memsw.limit_in_bytes'. Otherwise, the kernel will fail
+  // the cgroup write with EINVAL. As a result, the order of setting
+  // these two control files is important. See MESOS-7237 for details.
+  if (currentLimit.get() >= unlimited) {
+    // This is the first time memory limit is being set. So
+    // effectively we are reducing the memory limits because of which
+    // we need to set the 'memory.limit_in_bytes' before setting
+    // 'memory.memsw.limit_in_bytes'
+    setFunctions = {setLimitInBytes, setMemswLimitInBytes};
+  } else if (limit > currentLimit.get()) {
+    setFunctions = {setMemswLimitInBytes, setLimitInBytes};
+  }
+
+  foreach (const auto& setFunction, setFunctions) {
+    Try<Nothing> result = setFunction();
+    if (result.isError()) {
+      return Failure(result.error());
+    }
   }
 
   return Nothing();


[2/2] mesos git commit: Added a test to test the cgroup limit swap functionality.

Posted by ji...@apache.org.
Added a test to test the cgroup limit swap functionality.

Review: https://reviews.apache.org/r/57575


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/a8527e05
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/a8527e05
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/a8527e05

Branch: refs/heads/master
Commit: a8527e05b101d56538d09648e8fbae254b1d9a19
Parents: 9ba473a
Author: Jie Yu <yu...@gmail.com>
Authored: Mon Mar 13 17:03:16 2017 -0700
Committer: Jie Yu <yu...@gmail.com>
Committed: Fri Mar 17 20:33:04 2017 -0700

----------------------------------------------------------------------
 .../containerizer/cgroups_isolator_tests.cpp    | 116 +++++++++++++++++++
 1 file changed, 116 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/a8527e05/src/tests/containerizer/cgroups_isolator_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/cgroups_isolator_tests.cpp b/src/tests/containerizer/cgroups_isolator_tests.cpp
index d352e69..4e1d027 100644
--- a/src/tests/containerizer/cgroups_isolator_tests.cpp
+++ b/src/tests/containerizer/cgroups_isolator_tests.cpp
@@ -21,6 +21,8 @@
 #include <stout/format.hpp>
 #include <stout/gtest.hpp>
 
+#include <mesos/v1/scheduler.hpp>
+
 #include "slave/containerizer/mesos/containerizer.hpp"
 
 #include "slave/containerizer/mesos/isolators/cgroups/constants.hpp"
@@ -54,6 +56,8 @@ using mesos::internal::slave::Slave;
 
 using mesos::master::detector::MasterDetector;
 
+using mesos::v1::scheduler::Event;
+
 using process::Future;
 using process::Owned;
 using process::Queue;
@@ -65,7 +69,10 @@ using std::set;
 using std::string;
 using std::vector;
 
+using testing::_;
+using testing::DoAll;
 using testing::InvokeWithoutArgs;
+using testing::Return;
 
 namespace mesos {
 namespace internal {
@@ -441,6 +448,115 @@ TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_CFS_EnableCfs)
 }
 
 
+// This test verifies the limit swap functionality. Note that We use
+// the default executor here in order to exercise both the increasing
+// and decreasing of the memory limit.
+TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_LimitSwap)
+{
+  // Disable AuthN on the agent.
+  slave::Flags flags = CreateSlaveFlags();
+  flags.isolation = "filesystem/linux,cgroups/mem";
+  flags.cgroups_limit_swap = true;
+  flags.authenticate_http_readwrite = false;
+
+  // TODO(jieyu): Add a test filter for memsw support.
+  Result<Bytes> check = cgroups::memory::memsw_limit_in_bytes(
+      path::join(flags.cgroups_hierarchy, "memory"), "/");
+
+  ASSERT_FALSE(check.isError());
+
+  if (check.isNone()) {
+    return;
+  }
+
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+
+  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), flags);
+  ASSERT_SOME(slave);
+
+  auto scheduler = std::make_shared<v1::MockHTTPScheduler>();
+
+  v1::FrameworkInfo frameworkInfo = v1::DEFAULT_FRAMEWORK_INFO;
+
+  Future<Nothing> connected;
+  EXPECT_CALL(*scheduler, connected(_))
+    .WillOnce(DoAll(v1::scheduler::SendSubscribe(frameworkInfo),
+                    FutureSatisfy(&connected)));
+
+  v1::scheduler::TestMesos mesos(
+      master.get()->pid, ContentType::PROTOBUF, scheduler);
+
+  AWAIT_READY(connected);
+
+  Future<Event::Subscribed> subscribed;
+  EXPECT_CALL(*scheduler, subscribed(_, _))
+    .WillOnce(FutureArg<1>(&subscribed));
+
+  Future<Event::Offers> offers;
+  EXPECT_CALL(*scheduler, offers(_, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return());
+
+  EXPECT_CALL(*scheduler, heartbeat(_))
+    .WillRepeatedly(Return()); // Ignore heartbeats.
+
+  AWAIT_READY(subscribed);
+
+  v1::FrameworkID frameworkId(subscribed->framework_id());
+  v1::ExecutorInfo executorInfo = v1::createExecutorInfo(
+      "test_default_executor",
+      None(),
+      "cpus:0.1;mem:32;disk:32",
+      v1::ExecutorInfo::DEFAULT);
+
+  // Update `executorInfo` with the subscribed `frameworkId`.
+  executorInfo.mutable_framework_id()->CopyFrom(frameworkId);
+
+  AWAIT_READY(offers);
+  EXPECT_NE(0, offers->offers().size());
+
+  const v1::Offer& offer = offers->offers(0);
+
+  // NOTE: We use a non-shell command here because 'sh' might not be
+  // in the PATH. 'alpine' does not specify env PATH in the image. On
+  // some linux distribution, '/bin' is not in the PATH by default.
+  v1::TaskInfo taskInfo = v1::createTask(
+      offer.agent_id(),
+      v1::Resources::parse("cpus:0.1;mem:32;disk:32").get(),
+      v1::createCommandInfo("ls", {"ls", "-al", "/"}));
+
+  Future<Event::Update> updateRunning;
+  EXPECT_CALL(*scheduler, update(_, _))
+    .WillOnce(DoAll(FutureArg<1>(&updateRunning),
+                    v1::scheduler::SendAcknowledge(
+                        frameworkId,
+                        offer.agent_id())));
+
+  v1::Offer::Operation launchGroup = v1::LAUNCH_GROUP(
+      executorInfo,
+      v1::createTaskGroupInfo({taskInfo}));
+
+  mesos.send(v1::createCallAccept(frameworkId, offer, {launchGroup}));
+
+  AWAIT_READY(updateRunning);
+  ASSERT_EQ(v1::TASK_RUNNING, updateRunning->status().state());
+  EXPECT_EQ(taskInfo.task_id(), updateRunning->status().task_id());
+  EXPECT_TRUE(updateRunning->status().has_timestamp());
+
+  Future<Event::Update> updateFinished;
+  EXPECT_CALL(*scheduler, update(_, _))
+    .WillOnce(FutureArg<1>(&updateFinished));
+
+  AWAIT_READY(updateFinished);
+  ASSERT_EQ(v1::TASK_FINISHED, updateFinished->status().state());
+  EXPECT_EQ(taskInfo.task_id(), updateFinished->status().task_id());
+  EXPECT_TRUE(updateFinished->status().has_timestamp());
+}
+
+
 // The test verifies that the number of processes and threads in a
 // container is correctly reported.
 TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_PidsAndTids)