You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by qi...@apache.org on 2020/03/20 09:03:05 UTC
[mesos] 08/21: Set container process's OOM score adjust.
This is an automated email from the ASF dual-hosted git repository.
qianzhang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git
commit 8d51df87b058144d0ce51008e393b6261b6e9765
Author: Qian Zhang <zh...@gmail.com>
AuthorDate: Thu Jan 2 09:05:43 2020 +0800
Set container process's OOM score adjust.
Review: https://reviews.apache.org/r/71944
---
.../mesos/isolators/cgroups/cgroups.cpp | 8 +-
.../mesos/isolators/cgroups/subsystem.cpp | 9 ++-
.../mesos/isolators/cgroups/subsystem.hpp | 7 +-
.../mesos/isolators/cgroups/subsystems/devices.cpp | 3 +-
.../mesos/isolators/cgroups/subsystems/devices.hpp | 3 +-
.../mesos/isolators/cgroups/subsystems/memory.cpp | 87 +++++++++++++++++++++-
.../mesos/isolators/cgroups/subsystems/memory.hpp | 14 +++-
.../mesos/isolators/cgroups/subsystems/net_cls.cpp | 3 +-
.../mesos/isolators/cgroups/subsystems/net_cls.hpp | 3 +-
.../isolators/cgroups/subsystems/perf_event.cpp | 3 +-
.../isolators/cgroups/subsystems/perf_event.hpp | 3 +-
src/slave/containerizer/mesos/utils.cpp | 20 +++++
src/slave/containerizer/mesos/utils.hpp | 3 +
13 files changed, 148 insertions(+), 18 deletions(-)
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
index 8e858f4..4193538 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
@@ -484,7 +484,8 @@ Future<Option<ContainerLaunchInfo>> CgroupsIsolatorProcess::prepare(
infos[containerId]->subsystems.insert(subsystem->name());
prepares.push_back(subsystem->prepare(
containerId,
- infos[containerId]->cgroup));
+ infos[containerId]->cgroup,
+ containerConfig));
}
// Chown the cgroup so the executor or a nested container whose
@@ -705,10 +706,7 @@ Future<Nothing> CgroupsIsolatorProcess::isolate(
// containers with shared cgroups, because we don't call `prepare()`,
// `recover()`, or `cleanup()` on them either. If we were to call
// `isolate()` on them, the call would likely fail because the subsystem
- // doesn't know about the container. This is currently OK because
- // the only cgroup isolator that even implements `isolate()` is the
- // `NetClsSubsystem` and it doesn't do anything with the `pid`
- // passed in.
+ // doesn't know about the container.
//
// TODO(klueska): In the future we should revisit this to make
// sure that doing things this way is sufficient (or otherwise
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystem.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystem.cpp
index d9c8fa7..6393bee 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystem.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystem.cpp
@@ -116,13 +116,15 @@ Future<Nothing> Subsystem::recover(
Future<Nothing> Subsystem::prepare(
const ContainerID& containerId,
- const string& cgroup)
+ const string& cgroup,
+ const mesos::slave::ContainerConfig& containerConfig)
{
return process::dispatch(
process.get(),
&SubsystemProcess::prepare,
containerId,
- cgroup);
+ cgroup,
+ containerConfig);
}
@@ -221,7 +223,8 @@ Future<Nothing> SubsystemProcess::recover(
Future<Nothing> SubsystemProcess::prepare(
const ContainerID& containerId,
- const string& cgroup)
+ const string& cgroup,
+ const mesos::slave::ContainerConfig& containerConfig)
{
return Nothing();
}
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystem.hpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystem.hpp
index 088d417..7d33901 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystem.hpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystem.hpp
@@ -90,11 +90,13 @@ public:
*
* @param containerId The target containerId.
* @param cgroup The target cgroup.
+ * @param containerConfig The container configuration.
* @return Nothing or an error if `prepare` fails.
*/
process::Future<Nothing> prepare(
const ContainerID& containerId,
- const std::string& cgroup);
+ const std::string& cgroup,
+ const mesos::slave::ContainerConfig& containerConfig);
/**
* Isolate the associated container to cgroups subsystem.
@@ -198,7 +200,8 @@ public:
virtual process::Future<Nothing> prepare(
const ContainerID& containerId,
- const std::string& cgroup);
+ const std::string& cgroup,
+ const mesos::slave::ContainerConfig& containerConfig);
virtual process::Future<Nothing> isolate(
const ContainerID& containerId,
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.cpp
index ac2e66b..d1de13a 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.cpp
@@ -167,7 +167,8 @@ Future<Nothing> DevicesSubsystemProcess::recover(
Future<Nothing> DevicesSubsystemProcess::prepare(
const ContainerID& containerId,
- const string& cgroup)
+ const string& cgroup,
+ const mesos::slave::ContainerConfig& containerConfig)
{
if (containerIds.contains(containerId)) {
return Failure("The subsystem '" + name() + "' has already been prepared");
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.hpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.hpp
index c62deec..8c34c80 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.hpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.hpp
@@ -55,7 +55,8 @@ public:
process::Future<Nothing> prepare(
const ContainerID& containerId,
- const std::string& cgroup) override;
+ const std::string& cgroup,
+ const mesos::slave::ContainerConfig& containerConfig) override;
process::Future<Nothing> recover(
const ContainerID& containerId,
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
index 4102985..15f87ba 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
@@ -32,11 +32,14 @@
#include "common/protobuf_utils.hpp"
+#include "slave/containerizer/mesos/utils.hpp"
+
#include "slave/containerizer/mesos/isolators/cgroups/subsystems/memory.hpp"
using cgroups::memory::pressure::Counter;
using cgroups::memory::pressure::Level;
+using mesos::slave::ContainerConfig;
using mesos::slave::ContainerLimitation;
using process::Failure;
@@ -135,7 +138,8 @@ Future<Nothing> MemorySubsystemProcess::recover(
Future<Nothing> MemorySubsystemProcess::prepare(
const ContainerID& containerId,
- const string& cgroup)
+ const string& cgroup,
+ const ContainerConfig& containerConfig)
{
if (infos.contains(containerId)) {
return Failure("The subsystem '" + name() + "' has already been prepared");
@@ -143,6 +147,7 @@ Future<Nothing> MemorySubsystemProcess::prepare(
infos.put(containerId, Owned<Info>(new Info));
infos[containerId]->hardLimitUpdated = false;
+ infos[containerId]->isCommandTask = containerConfig.has_task_info();
oomListen(containerId, cgroup);
pressureListen(containerId, cgroup);
@@ -151,6 +156,86 @@ Future<Nothing> MemorySubsystemProcess::prepare(
}
+// Adjusts the OOM score of the container process `pid` once it has been
+// placed into `cgroup`. Only burstable task containers are touched; every
+// other kind of container keeps the default OOM score adjustment (i.e. 0).
+// Fails if the container is unknown to this subsystem, if either memory
+// limit cannot be read, or if the adjustment cannot be computed or written.
+Future<Nothing> MemorySubsystemProcess::isolate(
+    const ContainerID& containerId,
+    const string& cgroup,
+    pid_t pid)
+{
+  // `prepare()` must have registered this container beforehand.
+  if (!infos.contains(containerId)) {
+    return Failure(
+        "Failed to isolate subsystem '" + name() + "': Unknown container");
+  }
+
+  // The memory request of the container is its soft limit.
+  Try<Bytes> memRequest =
+    cgroups::memory::soft_limit_in_bytes(hierarchy, cgroup);
+
+  if (memRequest.isError()) {
+    return Failure(
+        "Failed to read 'memory.soft_limit_in_bytes': " + memRequest.error());
+  }
+
+  // The memory limit of the container is its hard limit.
+  Try<Bytes> memLimit =
+    cgroups::memory::limit_in_bytes(hierarchy, cgroup);
+
+  if (memLimit.isError()) {
+    return Failure(
+        "Failed to read 'memory.limit_in_bytes': " + memLimit.error());
+  }
+
+  // The OOM score of a process is a complex function of its state and
+  // configuration, but it is reasonably approximated by 10 x the percentage
+  // of memory the process uses plus `/proc/$pid/oom_score_adj` (a
+  // configurable value between -1000 and 1000). When the system runs out of
+  // memory, the processes with the higher OOM scores are killed.
+  //
+  // Burstable task containers which use more memory than they requested
+  // (i.e. more than their soft limits) should be OOM-killed preferentially.
+  // The adjustment written below attempts to give such a container an OOM
+  // score of 1000 once it consumes more memory than its request.
+  //
+  // Burstable task containers come in two kinds:
+  //   1. Command task containers whose soft limit < hard limit.
+  //   2. Nested task containers whose soft limit < hard limit.
+  //
+  // All of the following containers are left at the default OOM score
+  // adjustment (i.e. 0):
+  //   1. Containers whose soft limit == hard limit; this preserves
+  //      backward compatibility.
+  //   2. Default executor containers whose soft limit < hard limit.
+  //   3. Custom executor containers whose soft limit < hard limit.
+  //   4. Debug containers.
+  if (!(memRequest.get() < memLimit.get())) {
+    return Nothing();
+  }
+
+  if (!infos[containerId]->isCommandTask && !containerId.has_parent()) {
+    return Nothing();
+  }
+
+  Try<int> adjustment = calculateOOMScoreAdj(memRequest.get());
+  if (adjustment.isError()) {
+    return Failure(
+        "Failed to calculate OOM score adjustment: " + adjustment.error());
+  }
+
+  const string path = strings::format("/proc/%d/oom_score_adj", pid).get();
+
+  Try<Nothing> written = os::write(path, stringify(adjustment.get()));
+  if (written.isError()) {
+    return Failure("Failed to set OOM score adjustment: " + written.error());
+  }
+
+  LOG(INFO) << "Set " << path << " to " << adjustment.get()
+            << " for container " << containerId;
+
+  return Nothing();
+}
+
+
Future<ContainerLimitation> MemorySubsystemProcess::watch(
const ContainerID& containerId,
const string& cgroup)
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.hpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.hpp
index ed34df8..a4bbef8 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.hpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.hpp
@@ -57,7 +57,13 @@ public:
process::Future<Nothing> prepare(
const ContainerID& containerId,
- const std::string& cgroup) override;
+ const std::string& cgroup,
+ const mesos::slave::ContainerConfig& containerConfig) override;
+
+ process::Future<Nothing> isolate(
+ const ContainerID& containerId,
+ const std::string& cgroup,
+ pid_t pid) override;
process::Future<Nothing> recover(
const ContainerID& containerId,
@@ -97,6 +103,12 @@ private:
// Indicate whether the memory hard limit of this container has
// already been updated.
bool hardLimitUpdated;
+
+ // Indicates whether this is a command task container. Please note
+ // that this field is only needed in the isolating phase, so it is
+ // not recovered after an agent restart; as a result, its value may
+ // not be correct after agent recovery.
+ bool isCommandTask;
};
MemorySubsystemProcess(const Flags& flags, const std::string& hierarchy);
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.cpp
index ec2ce67..e140194 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.cpp
@@ -336,7 +336,8 @@ Future<Nothing> NetClsSubsystemProcess::recover(
Future<Nothing> NetClsSubsystemProcess::prepare(
const ContainerID& containerId,
- const string& cgroup)
+ const string& cgroup,
+ const mesos::slave::ContainerConfig& containerConfig)
{
if (infos.contains(containerId)) {
return Failure("The subsystem '" + name() + "' has already been prepared");
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.hpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.hpp
index 0653107..c604a2b 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.hpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.hpp
@@ -150,7 +150,8 @@ public:
process::Future<Nothing> prepare(
const ContainerID& containerId,
- const std::string& cgroup) override;
+ const std::string& cgroup,
+ const mesos::slave::ContainerConfig& containerConfig) override;
process::Future<Nothing> isolate(
const ContainerID& containerId,
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.cpp
index 180afc9..e88eea4 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.cpp
@@ -115,7 +115,8 @@ Future<Nothing> PerfEventSubsystemProcess::recover(
Future<Nothing> PerfEventSubsystemProcess::prepare(
const ContainerID& containerId,
- const string& cgroup)
+ const string& cgroup,
+ const mesos::slave::ContainerConfig& containerConfig)
{
if (infos.contains(containerId)) {
return Failure("The subsystem '" + name() + "' has already been prepared");
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.hpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.hpp
index 2c865ac..cac04fe 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.hpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.hpp
@@ -57,7 +57,8 @@ public:
process::Future<Nothing> prepare(
const ContainerID& containerId,
- const std::string& cgroup) override;
+ const std::string& cgroup,
+ const mesos::slave::ContainerConfig& containerConfig) override;
process::Future<Nothing> recover(
const ContainerID& containerId,
diff --git a/src/slave/containerizer/mesos/utils.cpp b/src/slave/containerizer/mesos/utils.cpp
index d9964f0..970aa59 100644
--- a/src/slave/containerizer/mesos/utils.cpp
+++ b/src/slave/containerizer/mesos/utils.cpp
@@ -140,6 +140,26 @@ Try<pid_t> getMountNamespaceTarget(pid_t parent)
}
#endif // __linux__
+
+// Computes the value to write to `/proc/<pid>/oom_score_adj` for a
+// container whose memory request is `memRequest`. The smaller the request
+// is relative to the total memory of this node, the closer the result is
+// to 1000; a request that is at least as large as the total memory yields
+// 0. The total memory is detected once through `os::memory()` and cached
+// for later calls.
+//
+// Returns an error if the total memory cannot be detected or is reported
+// as zero; otherwise the result is always within [0, 1000].
+Try<int> calculateOOMScoreAdj(const Bytes& memRequest)
+{
+  // Get the total memory of this node.
+  static Option<Bytes> totalMem;
+  if (totalMem.isNone()) {
+    Try<os::Memory> mem = os::memory();
+    if (mem.isError()) {
+      return Error(
+          "Failed to auto-detect the size of main memory: " + mem.error());
+    }
+
+    totalMem = mem->total;
+  }
+
+  CHECK_SOME(totalMem);
+
+  // Guard the division below.
+  if (totalMem->bytes() == 0) {
+    return Error("The detected size of main memory is zero");
+  }
+
+  // NOTE(review): `Bytes::bytes()` is presumably unsigned, in which case
+  // `1000 - x` wraps around whenever the request exceeds the total memory
+  // and the narrowed result can fall below -1000, which the kernel rejects
+  // when it is written to `oom_score_adj`. A container that requests at
+  // least the whole node can never exceed its request anyway, so it keeps
+  // the default adjustment (i.e. 0).
+  if (memRequest.bytes() >= totalMem->bytes()) {
+    return 0;
+  }
+
+  // The request is strictly smaller than the total memory here, so the
+  // quotient is below 1000 and the result fits into (0, 1000].
+  return static_cast<int>(
+      1000 - (1000 * memRequest.bytes()) / totalMem->bytes());
+}
+
} // namespace slave {
} // namespace internal {
} // namespace mesos {
diff --git a/src/slave/containerizer/mesos/utils.hpp b/src/slave/containerizer/mesos/utils.hpp
index bfd07e2..4a31dfd 100644
--- a/src/slave/containerizer/mesos/utils.hpp
+++ b/src/slave/containerizer/mesos/utils.hpp
@@ -30,6 +30,9 @@ namespace slave {
Try<pid_t> getMountNamespaceTarget(pid_t parent);
#endif // __linux__
+
+Try<int> calculateOOMScoreAdj(const Bytes& memRequest);
+
} // namespace slave {
} // namespace internal {
} // namespace mesos {