You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by qi...@apache.org on 2020/03/20 09:03:05 UTC

[mesos] 08/21: Set container process's OOM score adjust.

This is an automated email from the ASF dual-hosted git repository.

qianzhang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 8d51df87b058144d0ce51008e393b6261b6e9765
Author: Qian Zhang <zh...@gmail.com>
AuthorDate: Thu Jan 2 09:05:43 2020 +0800

    Set container process's OOM score adjust.
    
    Review: https://reviews.apache.org/r/71944
---
 .../mesos/isolators/cgroups/cgroups.cpp            |  8 +-
 .../mesos/isolators/cgroups/subsystem.cpp          |  9 ++-
 .../mesos/isolators/cgroups/subsystem.hpp          |  7 +-
 .../mesos/isolators/cgroups/subsystems/devices.cpp |  3 +-
 .../mesos/isolators/cgroups/subsystems/devices.hpp |  3 +-
 .../mesos/isolators/cgroups/subsystems/memory.cpp  | 87 +++++++++++++++++++++-
 .../mesos/isolators/cgroups/subsystems/memory.hpp  | 14 +++-
 .../mesos/isolators/cgroups/subsystems/net_cls.cpp |  3 +-
 .../mesos/isolators/cgroups/subsystems/net_cls.hpp |  3 +-
 .../isolators/cgroups/subsystems/perf_event.cpp    |  3 +-
 .../isolators/cgroups/subsystems/perf_event.hpp    |  3 +-
 src/slave/containerizer/mesos/utils.cpp            | 20 +++++
 src/slave/containerizer/mesos/utils.hpp            |  3 +
 13 files changed, 148 insertions(+), 18 deletions(-)

diff --git a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
index 8e858f4..4193538 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
@@ -484,7 +484,8 @@ Future<Option<ContainerLaunchInfo>> CgroupsIsolatorProcess::prepare(
       infos[containerId]->subsystems.insert(subsystem->name());
       prepares.push_back(subsystem->prepare(
           containerId,
-          infos[containerId]->cgroup));
+          infos[containerId]->cgroup,
+          containerConfig));
     }
 
     // Chown the cgroup so the executor or a nested container whose
@@ -705,10 +706,7 @@ Future<Nothing> CgroupsIsolatorProcess::isolate(
   // containers with shared cgroups, because we don't call `prepare()`,
   // `recover()`, or `cleanup()` on them either. If we were to call
   // `isolate()` on them, the call would likely fail because the subsystem
-  // doesn't know about the container. This is currently OK because
-  // the only cgroup isolator that even implements `isolate()` is the
-  // `NetClsSubsystem` and it doesn't do anything with the `pid`
-  // passed in.
+  // doesn't know about the container.
   //
   // TODO(klueska): In the future we should revisit this to make
   // sure that doing things this way is sufficient (or otherwise
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystem.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystem.cpp
index d9c8fa7..6393bee 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystem.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystem.cpp
@@ -116,13 +116,15 @@ Future<Nothing> Subsystem::recover(
 
 Future<Nothing> Subsystem::prepare(
     const ContainerID& containerId,
-    const string& cgroup)
+    const string& cgroup,
+    const mesos::slave::ContainerConfig& containerConfig)
 {
   return process::dispatch(
       process.get(),
       &SubsystemProcess::prepare,
       containerId,
-      cgroup);
+      cgroup,
+      containerConfig);
 }
 
 
@@ -221,7 +223,8 @@ Future<Nothing> SubsystemProcess::recover(
 
 Future<Nothing> SubsystemProcess::prepare(
     const ContainerID& containerId,
-    const string& cgroup)
+    const string& cgroup,
+    const mesos::slave::ContainerConfig& containerConfig)
 {
   return Nothing();
 }
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystem.hpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystem.hpp
index 088d417..7d33901 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystem.hpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystem.hpp
@@ -90,11 +90,13 @@ public:
    *
    * @param containerId The target containerId.
    * @param cgroup The target cgroup.
+   * @param containerConfig The container configuration.
    * @return Nothing or an error if `prepare` fails.
    */
   process::Future<Nothing> prepare(
       const ContainerID& containerId,
-      const std::string& cgroup);
+      const std::string& cgroup,
+      const mesos::slave::ContainerConfig& containerConfig);
 
   /**
    * Isolate the associated container to cgroups subsystem.
@@ -198,7 +200,8 @@ public:
 
   virtual process::Future<Nothing> prepare(
       const ContainerID& containerId,
-      const std::string& cgroup);
+      const std::string& cgroup,
+      const mesos::slave::ContainerConfig& containerConfig);
 
   virtual process::Future<Nothing> isolate(
       const ContainerID& containerId,
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.cpp
index ac2e66b..d1de13a 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.cpp
@@ -167,7 +167,8 @@ Future<Nothing> DevicesSubsystemProcess::recover(
 
 Future<Nothing> DevicesSubsystemProcess::prepare(
     const ContainerID& containerId,
-    const string& cgroup)
+    const string& cgroup,
+    const mesos::slave::ContainerConfig& containerConfig)
 {
   if (containerIds.contains(containerId)) {
     return Failure("The subsystem '" + name() + "' has already been prepared");
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.hpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.hpp
index c62deec..8c34c80 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.hpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/devices.hpp
@@ -55,7 +55,8 @@ public:
 
   process::Future<Nothing> prepare(
       const ContainerID& containerId,
-      const std::string& cgroup) override;
+      const std::string& cgroup,
+      const mesos::slave::ContainerConfig& containerConfig) override;
 
   process::Future<Nothing> recover(
       const ContainerID& containerId,
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
index 4102985..15f87ba 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.cpp
@@ -32,11 +32,14 @@
 
 #include "common/protobuf_utils.hpp"
 
+#include "slave/containerizer/mesos/utils.hpp"
+
 #include "slave/containerizer/mesos/isolators/cgroups/subsystems/memory.hpp"
 
 using cgroups::memory::pressure::Counter;
 using cgroups::memory::pressure::Level;
 
+using mesos::slave::ContainerConfig;
 using mesos::slave::ContainerLimitation;
 
 using process::Failure;
@@ -135,7 +138,8 @@ Future<Nothing> MemorySubsystemProcess::recover(
 
 Future<Nothing> MemorySubsystemProcess::prepare(
     const ContainerID& containerId,
-    const string& cgroup)
+    const string& cgroup,
+    const ContainerConfig& containerConfig)
 {
   if (infos.contains(containerId)) {
     return Failure("The subsystem '" + name() + "' has already been prepared");
@@ -143,6 +147,7 @@ Future<Nothing> MemorySubsystemProcess::prepare(
 
   infos.put(containerId, Owned<Info>(new Info));
   infos[containerId]->hardLimitUpdated = false;
+  infos[containerId]->isCommandTask = containerConfig.has_task_info();
 
   oomListen(containerId, cgroup);
   pressureListen(containerId, cgroup);
@@ -151,6 +156,86 @@ Future<Nothing> MemorySubsystemProcess::prepare(
 }
 
 
+Future<Nothing> MemorySubsystemProcess::isolate(
+    const ContainerID& containerId,
+    const string& cgroup,
+    pid_t pid)
+{
+  if (!infos.contains(containerId)) {
+    return Failure(
+        "Failed to isolate subsystem '" + name() + "'"
+        ": Unknown container");
+  }
+
+  // Get the soft limit.
+  Try<Bytes> softLimit =
+    cgroups::memory::soft_limit_in_bytes(hierarchy, cgroup);
+
+  if (softLimit.isError()) {
+    return Failure(
+        "Failed to read 'memory.soft_limit_in_bytes'"
+        ": " + softLimit.error());
+  }
+
+  // Get the hard limit.
+  Try<Bytes> hardLimit =
+    cgroups::memory::limit_in_bytes(hierarchy, cgroup);
+
+  if (hardLimit.isError()) {
+    return Failure(
+        "Failed to read 'memory.limit_in_bytes'"
+        ": " + hardLimit.error());
+  }
+
+  // While the OOM score of a process is a complex function of the process state
+  // and configuration, a decent approximation of the OOM score is 10 x percent
+  // of memory used by the process + `/proc/$pid/oom_score_adj` (a configurable
+  // quantity which is between -1000 and 1000). Containers with higher OOM
+  // scores are killed if the system runs out of memory.
+  //
+  // We would like burstable task containers which consume more memory than
+  // their memory requests (i.e. soft limits) to be preferentially OOM-killed
+  // first. To accomplish this, we set their OOM score adjustment as shown
+  // below, which attempts to ensure that the container which consumes more
+  // memory than its memory request will have an OOM score of 1000.
+  //
+  // Please note that there are two kinds of burstable task containers:
+  //   1. Command task containers whose soft limit < hard limit.
+  //   2. Nested task containers whose soft limit < hard limit.
+  //
+  // For any other kinds of containers (see below), we will just leave their OOM
+  // score adjustments at the default value (i.e. 0).
+  //   1. Containers whose soft limit == hard limit, this is to ensure backward
+  //      compatibility.
+  //   2. Default executor containers whose soft limit < hard limit.
+  //   3. Custom executor containers whose soft limit < hard limit.
+  //   4. Debug containers.
+  if (softLimit.get() < hardLimit.get() &&
+      (infos[containerId]->isCommandTask || containerId.has_parent())) {
+    Try<int> oomScoreAdj = calculateOOMScoreAdj(softLimit.get());
+    if (oomScoreAdj.isError()) {
+      return Failure(
+          "Failed to calculate OOM score adjustment: " + oomScoreAdj.error());
+    }
+
+    const string oomScoreAdjPath =
+      strings::format("/proc/%d/oom_score_adj", pid).get();
+
+    Try<Nothing> write =
+      os::write(oomScoreAdjPath, stringify(oomScoreAdj.get()));
+
+    if (write.isError()) {
+      return Failure("Failed to set OOM score adjustment: " + write.error());
+    }
+
+    LOG(INFO) << "Set " << oomScoreAdjPath << " to " << oomScoreAdj.get()
+              << " for container " << containerId;
+  }
+
+  return Nothing();
+}
+
+
 Future<ContainerLimitation> MemorySubsystemProcess::watch(
     const ContainerID& containerId,
     const string& cgroup)
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.hpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.hpp
index ed34df8..a4bbef8 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.hpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/memory.hpp
@@ -57,7 +57,13 @@ public:
 
   process::Future<Nothing> prepare(
       const ContainerID& containerId,
-      const std::string& cgroup) override;
+      const std::string& cgroup,
+      const mesos::slave::ContainerConfig& containerConfig) override;
+
+  process::Future<Nothing> isolate(
+      const ContainerID& containerId,
+      const std::string& cgroup,
+      pid_t pid) override;
 
   process::Future<Nothing> recover(
       const ContainerID& containerId,
@@ -97,6 +103,12 @@ private:
     // Indicate whether the memory hard limit of this container has
     // already been updated.
     bool hardLimitUpdated;
+
+    // Indicates whether this is a command task container. Please note
+    // that we only need to use this field in isolating phase, so we do
+    // not recover it after agent restarts, that means its value may not
+    // be correct after agent recovery.
+    bool isCommandTask;
   };
 
   MemorySubsystemProcess(const Flags& flags, const std::string& hierarchy);
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.cpp
index ec2ce67..e140194 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.cpp
@@ -336,7 +336,8 @@ Future<Nothing> NetClsSubsystemProcess::recover(
 
 Future<Nothing> NetClsSubsystemProcess::prepare(
     const ContainerID& containerId,
-    const string& cgroup)
+    const string& cgroup,
+    const mesos::slave::ContainerConfig& containerConfig)
 {
   if (infos.contains(containerId)) {
     return Failure("The subsystem '" + name() + "' has already been prepared");
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.hpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.hpp
index 0653107..c604a2b 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.hpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/net_cls.hpp
@@ -150,7 +150,8 @@ public:
 
   process::Future<Nothing> prepare(
       const ContainerID& containerId,
-      const std::string& cgroup) override;
+      const std::string& cgroup,
+      const mesos::slave::ContainerConfig& containerConfig) override;
 
   process::Future<Nothing> isolate(
       const ContainerID& containerId,
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.cpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.cpp
index 180afc9..e88eea4 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.cpp
@@ -115,7 +115,8 @@ Future<Nothing> PerfEventSubsystemProcess::recover(
 
 Future<Nothing> PerfEventSubsystemProcess::prepare(
     const ContainerID& containerId,
-    const string& cgroup)
+    const string& cgroup,
+    const mesos::slave::ContainerConfig& containerConfig)
 {
   if (infos.contains(containerId)) {
     return Failure("The subsystem '" + name() + "' has already been prepared");
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.hpp b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.hpp
index 2c865ac..cac04fe 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.hpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/subsystems/perf_event.hpp
@@ -57,7 +57,8 @@ public:
 
   process::Future<Nothing> prepare(
       const ContainerID& containerId,
-      const std::string& cgroup) override;
+      const std::string& cgroup,
+      const mesos::slave::ContainerConfig& containerConfig) override;
 
   process::Future<Nothing> recover(
       const ContainerID& containerId,
diff --git a/src/slave/containerizer/mesos/utils.cpp b/src/slave/containerizer/mesos/utils.cpp
index d9964f0..970aa59 100644
--- a/src/slave/containerizer/mesos/utils.cpp
+++ b/src/slave/containerizer/mesos/utils.cpp
@@ -140,6 +140,26 @@ Try<pid_t> getMountNamespaceTarget(pid_t parent)
 }
 #endif // __linux__
 
+
+Try<int> calculateOOMScoreAdj(const Bytes& memRequest)
+{
+  // Get the total memory of this node.
+  static Option<Bytes> totalMem;
+  if (totalMem.isNone()) {
+    Try<os::Memory> mem = os::memory();
+    if (mem.isError()) {
+      return Error(
+          "Failed to auto-detect the size of main memory: " + mem.error());
+    }
+
+    totalMem = mem->total;
+  }
+
+  CHECK_SOME(totalMem);
+
+  return 1000 - (1000 * memRequest.bytes()) / totalMem->bytes();
+}
+
 } // namespace slave {
 } // namespace internal {
 } // namespace mesos {
diff --git a/src/slave/containerizer/mesos/utils.hpp b/src/slave/containerizer/mesos/utils.hpp
index bfd07e2..4a31dfd 100644
--- a/src/slave/containerizer/mesos/utils.hpp
+++ b/src/slave/containerizer/mesos/utils.hpp
@@ -30,6 +30,9 @@ namespace slave {
 Try<pid_t> getMountNamespaceTarget(pid_t parent);
 #endif // __linux__
 
+
+Try<int> calculateOOMScoreAdj(const Bytes& memRequest);
+
 } // namespace slave {
 } // namespace internal {
 } // namespace mesos {