You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by ji...@apache.org on 2017/03/18 03:33:27 UTC
[1/2] mesos git commit: Ensured the cgroup memory+swap limit is no
less than the memory limit.
Repository: mesos
Updated Branches:
refs/heads/master d0cd96b8f -> a8527e05b
Ensured the cgroup memory+swap limit is no less than the memory limit.
This addressed the bug we saw in MESOS-7237. Linux kernel will ensure
cgroup memory+swap limit (i.e., 'memory.memsw.limit_in_bytes') is always
no less than the memory limit ('memory.limit_in_bytes'). Prior to this
patch, the memory subsystem in the cgroups isolator is buggy because
memory+swap limit is always changed after the memory limit. This will
cause EINVAL if we increase the memory limit of the container.
Review: https://reviews.apache.org/r/57574
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/9ba473a1
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/9ba473a1
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/9ba473a1
Branch: refs/heads/master
Commit: 9ba473a1e837b0bfb689a5a7f4f5b5d4b12a6db8
Parents: d0cd96b
Author: Jie Yu <yu...@gmail.com>
Authored: Mon Mar 13 16:58:03 2017 -0700
Committer: Jie Yu <yu...@gmail.com>
Committed: Fri Mar 17 20:32:59 2017 -0700
----------------------------------------------------------------------
.../isolators/cgroups/subsystems/memory.cpp | 84 +++++++++++++-------
1 file changed, 56 insertions(+), 28 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/9ba473a1/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
index 009e996..113e908 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
@@ -211,39 +211,16 @@ Future<Nothing> MemorySubsystem::update(
": " + currentLimit.error());
}
- // Determine whether to set the hard limit. We only update the hard
- // limit if this is the first time or when we're raising the
- // existing limit, then we can update the hard limit safely.
- // Otherwise, if we need to decrease 'memory.limit_in_bytes' we may
- // induce an OOM if too much memory is in use. As a result, we only
- // update the soft limit when the memory reservation is being
- // reduced. This is probably okay if the machine has available
- // resources.
- //
- // TODO(benh): Introduce a MemoryWatcherProcess which monitors the
- // discrepancy between usage and soft limit and introduces a "manual
- // oom" if necessary.
- //
- // If this is the first time, 'memory.limit_in_bytes' is the inital
- // value which may be one of following possible values:
- // * LONG_MAX (Linux Kernel Version < 3.12)
- // * ULONG_MAX (3.12 <= Linux Kernel Version < 3.19)
- // * LONG_MAX / pageSize * pageSize (Linux Kernel Version >= 3.19)
- // Thus, if 'currentLimit' is greater or equals to 'initialLimit'
- // below, we know it's the first time.
- static const size_t pageSize = os::pagesize();
- Bytes initialLimit(static_cast<uint64_t>(LONG_MAX / pageSize * pageSize));
+ bool limitSwap = flags.cgroups_limit_swap;
- if (currentLimit.get() >= initialLimit || limit > currentLimit.get()) {
- // We always set limit_in_bytes first and optionally set
- // memsw.limit_in_bytes if `cgroups_limit_swap` is true.
+ auto setLimitInBytes = [=]() -> Try<Nothing> {
Try<Nothing> write = cgroups::memory::limit_in_bytes(
hierarchy,
cgroup,
limit);
if (write.isError()) {
- return Failure(
+ return Error(
"Failed to set 'memory.limit_in_bytes'"
": " + write.error());
}
@@ -251,14 +228,18 @@ Future<Nothing> MemorySubsystem::update(
LOG(INFO) << "Updated 'memory.limit_in_bytes' to " << limit
<< " for container " << containerId;
- if (flags.cgroups_limit_swap) {
+ return Nothing();
+ };
+
+ auto setMemswLimitInBytes = [=]() -> Try<Nothing> {
+ if (limitSwap) {
Try<bool> write = cgroups::memory::memsw_limit_in_bytes(
hierarchy,
cgroup,
limit);
if (write.isError()) {
- return Failure(
+ return Error(
"Failed to set 'memory.memsw.limit_in_bytes'"
": " + write.error());
}
@@ -266,6 +247,53 @@ Future<Nothing> MemorySubsystem::update(
LOG(INFO) << "Updated 'memory.memsw.limit_in_bytes' to " << limit
<< " for container " << containerId;
}
+
+ return Nothing();
+ };
+
+ vector<lambda::function<Try<Nothing>(void)>> setFunctions;
+
+ // Now, determine whether to set the hard limit. We only update the
+ // hard limit if this is the first time or when we're raising the
+ // existing limit, then we can update the hard limit safely.
+ // Otherwise, if we need to decrease 'memory.limit_in_bytes' we may
+ // induce an OOM if too much memory is in use. As a result, we only
+ // update the soft limit when the memory reservation is being
+ // reduced. This is probably okay if the machine has available
+ // resources.
+ //
+ // TODO(benh): Introduce a MemoryWatcherProcess which monitors the
+ // discrepancy between usage and soft limit and introduces a "manual
+ // oom" if necessary.
+ //
+ // If this is the first time, 'memory.limit_in_bytes' is unlimited
+ // which may be one of following possible values:
+ // * LONG_MAX (Linux Kernel Version < 3.12)
+ // * ULONG_MAX (3.12 <= Linux Kernel Version < 3.19)
+ // * LONG_MAX / pageSize * pageSize (Linux Kernel Version >= 3.19)
+ static const size_t pageSize = os::pagesize();
+ Bytes unlimited(static_cast<uint64_t>(LONG_MAX / pageSize * pageSize));
+
+ // NOTE: It's required by the Linux kernel that
+ // 'memory.limit_in_bytes' should be less than or equal to
+ // 'memory.memsw.limit_in_bytes'. Otherwise, the kernel will fail
+ // the cgroup write with EINVAL. As a result, the order of setting
+ // these two control files is important. See MESOS-7237 for details.
+ if (currentLimit.get() >= unlimited) {
+ // This is the first time memory limit is being set. So
+ // effectively we are reducing the memory limits because of which
+ // we need to set the 'memory.limit_in_bytes' before setting
+ // 'memory.memsw.limit_in_bytes'
+ setFunctions = {setLimitInBytes, setMemswLimitInBytes};
+ } else if (limit > currentLimit.get()) {
+ setFunctions = {setMemswLimitInBytes, setLimitInBytes};
+ }
+
+ foreach (const auto& setFunction, setFunctions) {
+ Try<Nothing> result = setFunction();
+ if (result.isError()) {
+ return Failure(result.error());
+ }
}
return Nothing();
[2/2] mesos git commit: Added a test to test the cgroup limit swap
functionality.
Posted by ji...@apache.org.
Added a test to test the cgroup limit swap functionality.
Review: https://reviews.apache.org/r/57575
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/a8527e05
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/a8527e05
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/a8527e05
Branch: refs/heads/master
Commit: a8527e05b101d56538d09648e8fbae254b1d9a19
Parents: 9ba473a
Author: Jie Yu <yu...@gmail.com>
Authored: Mon Mar 13 17:03:16 2017 -0700
Committer: Jie Yu <yu...@gmail.com>
Committed: Fri Mar 17 20:33:04 2017 -0700
----------------------------------------------------------------------
.../containerizer/cgroups_isolator_tests.cpp | 116 +++++++++++++++++++
1 file changed, 116 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/a8527e05/src/tests/containerizer/cgroups_isolator_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/cgroups_isolator_tests.cpp b/src/tests/containerizer/cgroups_isolator_tests.cpp
index d352e69..4e1d027 100644
--- a/src/tests/containerizer/cgroups_isolator_tests.cpp
+++ b/src/tests/containerizer/cgroups_isolator_tests.cpp
@@ -21,6 +21,8 @@
#include <stout/format.hpp>
#include <stout/gtest.hpp>
+#include <mesos/v1/scheduler.hpp>
+
#include "slave/containerizer/mesos/containerizer.hpp"
#include "slave/containerizer/mesos/isolators/cgroups/constants.hpp"
@@ -54,6 +56,8 @@ using mesos::internal::slave::Slave;
using mesos::master::detector::MasterDetector;
+using mesos::v1::scheduler::Event;
+
using process::Future;
using process::Owned;
using process::Queue;
@@ -65,7 +69,10 @@ using std::set;
using std::string;
using std::vector;
+using testing::_;
+using testing::DoAll;
using testing::InvokeWithoutArgs;
+using testing::Return;
namespace mesos {
namespace internal {
@@ -441,6 +448,115 @@ TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_CFS_EnableCfs)
}
+// This test verifies the limit swap functionality. Note that We use
+// the default executor here in order to exercise both the increasing
+// and decreasing of the memory limit.
+TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_LimitSwap)
+{
+ // Disable AuthN on the agent.
+ slave::Flags flags = CreateSlaveFlags();
+ flags.isolation = "filesystem/linux,cgroups/mem";
+ flags.cgroups_limit_swap = true;
+ flags.authenticate_http_readwrite = false;
+
+ // TODO(jieyu): Add a test filter for memsw support.
+ Result<Bytes> check = cgroups::memory::memsw_limit_in_bytes(
+ path::join(flags.cgroups_hierarchy, "memory"), "/");
+
+ ASSERT_FALSE(check.isError());
+
+ if (check.isNone()) {
+ return;
+ }
+
+ Try<Owned<cluster::Master>> master = StartMaster();
+ ASSERT_SOME(master);
+
+ Owned<MasterDetector> detector = master.get()->createDetector();
+
+ Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), flags);
+ ASSERT_SOME(slave);
+
+ auto scheduler = std::make_shared<v1::MockHTTPScheduler>();
+
+ v1::FrameworkInfo frameworkInfo = v1::DEFAULT_FRAMEWORK_INFO;
+
+ Future<Nothing> connected;
+ EXPECT_CALL(*scheduler, connected(_))
+ .WillOnce(DoAll(v1::scheduler::SendSubscribe(frameworkInfo),
+ FutureSatisfy(&connected)));
+
+ v1::scheduler::TestMesos mesos(
+ master.get()->pid, ContentType::PROTOBUF, scheduler);
+
+ AWAIT_READY(connected);
+
+ Future<Event::Subscribed> subscribed;
+ EXPECT_CALL(*scheduler, subscribed(_, _))
+ .WillOnce(FutureArg<1>(&subscribed));
+
+ Future<Event::Offers> offers;
+ EXPECT_CALL(*scheduler, offers(_, _))
+ .WillOnce(FutureArg<1>(&offers))
+ .WillRepeatedly(Return());
+
+ EXPECT_CALL(*scheduler, heartbeat(_))
+ .WillRepeatedly(Return()); // Ignore heartbeats.
+
+ AWAIT_READY(subscribed);
+
+ v1::FrameworkID frameworkId(subscribed->framework_id());
+ v1::ExecutorInfo executorInfo = v1::createExecutorInfo(
+ "test_default_executor",
+ None(),
+ "cpus:0.1;mem:32;disk:32",
+ v1::ExecutorInfo::DEFAULT);
+
+ // Update `executorInfo` with the subscribed `frameworkId`.
+ executorInfo.mutable_framework_id()->CopyFrom(frameworkId);
+
+ AWAIT_READY(offers);
+ EXPECT_NE(0, offers->offers().size());
+
+ const v1::Offer& offer = offers->offers(0);
+
+ // NOTE: We use a non-shell command here because 'sh' might not be
+ // in the PATH. 'alpine' does not specify env PATH in the image. On
+ // some linux distribution, '/bin' is not in the PATH by default.
+ v1::TaskInfo taskInfo = v1::createTask(
+ offer.agent_id(),
+ v1::Resources::parse("cpus:0.1;mem:32;disk:32").get(),
+ v1::createCommandInfo("ls", {"ls", "-al", "/"}));
+
+ Future<Event::Update> updateRunning;
+ EXPECT_CALL(*scheduler, update(_, _))
+ .WillOnce(DoAll(FutureArg<1>(&updateRunning),
+ v1::scheduler::SendAcknowledge(
+ frameworkId,
+ offer.agent_id())));
+
+ v1::Offer::Operation launchGroup = v1::LAUNCH_GROUP(
+ executorInfo,
+ v1::createTaskGroupInfo({taskInfo}));
+
+ mesos.send(v1::createCallAccept(frameworkId, offer, {launchGroup}));
+
+ AWAIT_READY(updateRunning);
+ ASSERT_EQ(v1::TASK_RUNNING, updateRunning->status().state());
+ EXPECT_EQ(taskInfo.task_id(), updateRunning->status().task_id());
+ EXPECT_TRUE(updateRunning->status().has_timestamp());
+
+ Future<Event::Update> updateFinished;
+ EXPECT_CALL(*scheduler, update(_, _))
+ .WillOnce(FutureArg<1>(&updateFinished));
+
+ AWAIT_READY(updateFinished);
+ ASSERT_EQ(v1::TASK_FINISHED, updateFinished->status().state());
+ EXPECT_EQ(taskInfo.task_id(), updateFinished->status().task_id());
+ EXPECT_TRUE(updateFinished->status().has_timestamp());
+}
+
+
// The test verifies that the number of processes and threads in a
// container is correctly reported.
TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_PidsAndTids)