You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by gr...@apache.org on 2020/04/07 01:49:52 UTC
[mesos] 01/03: Added agent-side validation for resource limits.
This is an automated email from the ASF dual-hosted git repository.
grag pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git
commit ad612599df9e866b2a622baa4cdb659ece5c4574
Author: Greg Mann <gr...@mesosphere.io>
AuthorDate: Mon Apr 6 15:16:33 2020 -0700
Added agent-side validation for resource limits.
This prevents tasks from being launched on agents which would
not be capable of enforcing the specified limits.
Review: https://reviews.apache.org/r/72297/
---
src/slave/slave.cpp | 64 +++++++++++++++++++++++++++++++++--
src/slave/slave.hpp | 3 ++
src/tests/master_tests.cpp | 10 +++++-
src/tests/master_validation_tests.cpp | 20 +++++++++--
4 files changed, 91 insertions(+), 6 deletions(-)
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 6a48023..1a32c81 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -2168,6 +2168,34 @@ void Slave::runTask(
}
+Option<Error> Slave::validateResourceLimitsAndIsolators(
+ const vector<TaskInfo>& tasks)
+{
+ foreach (const TaskInfo& task, tasks) {
+ if (!(task.has_container() &&
+ task.container().type() == ContainerInfo::DOCKER)) {
+ if (task.limits().count("cpus") &&
+ !(strings::contains(flags.isolation, "cgroups/cpu") ||
+ strings::contains(flags.isolation, "cgroups/all"))) {
+ return Error(
+ "CPU limits can only be set on tasks launched in Mesos containers"
+ " when the agent has loaded the 'cgroups/cpu' isolator");
+ }
+
+ if (task.limits().count("mem") &&
+ !(strings::contains(flags.isolation, "cgroups/mem") ||
+ strings::contains(flags.isolation, "cgroups/all"))) {
+ return Error(
+ "Memory limits can only be set on tasks launched in Mesos"
+ " containers when the agent has loaded the 'cgroups/mem' isolator");
+ }
+ }
+ }
+
+ return None();
+}
+
+
void Slave::run(
const FrameworkInfo& frameworkInfo,
ExecutorInfo executorInfo,
@@ -2320,6 +2348,40 @@ void Slave::run(
}
}
+ CHECK_NOTNULL(framework);
+
+ Option<Error> error = validateResourceLimitsAndIsolators(tasks);
+ if (error.isSome()) {
+ // We report TASK_DROPPED to the framework because the task was
+ // never launched. For non-partition-aware frameworks, we report
+ // TASK_LOST for backward compatibility.
+ mesos::TaskState taskState = TASK_DROPPED;
+ if (!protobuf::frameworkHasCapability(
+ frameworkInfo, FrameworkInfo::Capability::PARTITION_AWARE)) {
+ taskState = TASK_LOST;
+ }
+
+ foreach (const TaskInfo& _task, tasks) {
+ const StatusUpdate update = protobuf::createStatusUpdate(
+ frameworkId,
+ info.id(),
+ _task.task_id(),
+ taskState,
+ TaskStatus::SOURCE_SLAVE,
+ id::UUID::random(),
+ error->message,
+ TaskStatus::REASON_GC_ERROR);
+
+ statusUpdate(update, UPID());
+ }
+
+ if (framework->idle()) {
+ removeFramework(framework);
+ }
+
+ return;
+ }
+
const ExecutorID& executorId = executorInfo.executor_id();
if (HookManager::hooksAvailable()) {
@@ -2342,8 +2404,6 @@ void Slave::run(
}
}
- CHECK_NOTNULL(framework);
-
// Track the pending task / task group to ensure the framework is
// not removed and the framework and top level executor directories
// are not scheduled for deletion before '_run()' is called.
diff --git a/src/slave/slave.hpp b/src/slave/slave.hpp
index d7e65e0..2cf45c6 100644
--- a/src/slave/slave.hpp
+++ b/src/slave/slave.hpp
@@ -158,6 +158,9 @@ public:
const process::UPID& from,
RunTaskMessage&& runTaskMessage);
+ Option<Error> validateResourceLimitsAndIsolators(
+ const std::vector<TaskInfo>& tasks);
+
// Made 'virtual' for Slave mocking.
virtual void runTask(
const process::UPID& from,
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 8cc8d20..785e5d5 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -4330,7 +4330,15 @@ TEST_F(MasterTest, TasksEndpoint)
TestContainerizer containerizer(&exec);
Owned<MasterDetector> detector = master.get()->createDetector();
- Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), &containerizer);
+
+ // We must enable the CPU and memory isolators on the agent so that it can
+ // accept resource limits.
+ slave::Flags flags = CreateSlaveFlags();
+ flags.isolation = "cgroups/cpu,cgroups/mem";
+
+ Try<Owned<cluster::Slave>> slave =
+ StartSlave(detector.get(), &containerizer, flags);
+
ASSERT_SOME(slave);
MockScheduler sched;
diff --git a/src/tests/master_validation_tests.cpp b/src/tests/master_validation_tests.cpp
index 9efca42..816635a 100644
--- a/src/tests/master_validation_tests.cpp
+++ b/src/tests/master_validation_tests.cpp
@@ -5036,7 +5036,13 @@ TEST_F(TaskGroupValidationTest, ResourceLimits)
AWAIT_READY(offers);
ASSERT_FALSE(offers->empty());
- // Valid, no error.
+ // Erase the memory limit so we can be sure what error message to look for
+ // in the next test case.
+ task1.mutable_limits()->erase("mem");
+ task2.mutable_limits()->erase("mem");
+
+ // Error: the agent does not have any isolators loaded, so it can't enforce
+ // resource limits and does not launch the task group.
{
TaskGroupInfo taskGroup;
taskGroup.add_tasks()->CopyFrom(task1);
@@ -5061,10 +5067,18 @@ TEST_F(TaskGroupValidationTest, ResourceLimits)
driver.acceptOffers({offers->at(0).id()}, {operation}, filters);
AWAIT_READY(task1Status);
- EXPECT_EQ(TASK_STARTING, task1Status->state());
+ EXPECT_EQ(TASK_LOST, task1Status->state());
+ EXPECT_EQ(
+ "CPU limits can only be set on tasks launched in Mesos containers"
+ " when the agent has loaded the 'cgroups/cpu' isolator",
+ task1Status->message());
AWAIT_READY(task2Status);
- EXPECT_EQ(TASK_STARTING, task2Status->state());
+ EXPECT_EQ(TASK_LOST, task2Status->state());
+ EXPECT_EQ(
+ "CPU limits can only be set on tasks launched in Mesos containers"
+ " when the agent has loaded the 'cgroups/cpu' isolator",
+ task2Status->message());
}
}