You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by nn...@apache.org on 2015/12/16 22:22:24 UTC

[1/2] mesos git commit: Added description of the LoadQoSController in the oversubscription.md.

Repository: mesos
Updated Branches:
  refs/heads/master 00983c1fa -> eec326e5e


Added description of the LoadQoSController in the oversubscription.md.

Added description to _Writing a custom QoS controller_ and _Configuring
oversubscription_ sections.

Review: https://reviews.apache.org/r/41042/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/eec326e5
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/eec326e5
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/eec326e5

Branch: refs/heads/master
Commit: eec326e5edb76465bb38b528aa6da4dac8814e6d
Parents: 5dadef2
Author: Bartek Plotka <bw...@gmail.com>
Authored: Wed Dec 16 11:17:32 2015 -0800
Committer: Niklas Q. Nielsen <ni...@qni.dk>
Committed: Wed Dec 16 13:22:06 2015 -0800

----------------------------------------------------------------------
 docs/oversubscription.md | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/eec326e5/docs/oversubscription.md
----------------------------------------------------------------------
diff --git a/docs/oversubscription.md b/docs/oversubscription.md
index 7d1415a..0b1c20b 100644
--- a/docs/oversubscription.md
+++ b/docs/oversubscription.md
@@ -197,6 +197,16 @@ The QoS Controller informs the slave that particular corrective actions need to
 be made. Each corrective action contains information about executor or task and
 the type of action to perform.
 
+Mesos comes with a `noop` and a `load` QoS controller. The `noop` controller
+does not provide any corrections, and thus does not assure any quality of
+service for regular tasks. The `load` controller ensures that the total system
+load does not exceed configurable thresholds, and so tries to avoid CPU
+congestion on the node. If the load is above either threshold, the controller
+evicts all revocable executors. The thresholds are set via two module
+parameters, `load_threshold_5min` and `load_threshold_15min`, which refer to
+the standard Unix 5 and 15 minute load averages. The 1 minute load average is
+ignored, since it can be a misleading signal for oversubscription.
+
 ~~~{.proto}
 message QoSCorrection {
   enum Type {
@@ -293,5 +303,37 @@ The `fixed` resource estimator is enabled as follows:
 In the example above, a fixed amount of 14 cpus will be offered as revocable
 resources.
 
+The `load` QoS controller is enabled as follows:
+
+```
+--qos_controller="org_apache_mesos_LoadQoSController"
+
+--qos_correction_interval_min="20secs"
+
+--modules='{
+  "libraries": {
+    "file": "/usr/local/lib64/libload_qos_controller.so",
+    "modules": {
+      "name": "org_apache_mesos_LoadQoSController",
+      "parameters": [
+        {
+          "key": "load_threshold_5min",
+          "value": "6"
+        },
+        {
+          "key": "load_threshold_15min",
+          "value": "4"
+        }
+      ]
+    }
+  }
+}'
+```
+
+In the example above, when the standard Unix system load average for 5 minutes
+is above 6, or for 15 minutes is above 4, the slave will evict all the
+`revocable` executors. The `LoadQoSController` will effectively be run every 20
+seconds.
+
 To install a custom resource estimator and QoS controller, please refer to the
 [modules documentation](modules.md).


[2/2] mesos git commit: Added Load QoS Controller for simple eviction when system load is above configured threshold.

Posted by nn...@apache.org.
Added Load QoS Controller for simple eviction when system load is above configured threshold.

Added Load QoS Controller for the simple eviction when system load is
above configured system load threshold for 5min and 15min:
- Made os::loadavg be called from a lambda that is passed via the constructor.
- Added unit test.

Review: https://reviews.apache.org/r/40617/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/5dadef2a
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/5dadef2a
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/5dadef2a

Branch: refs/heads/master
Commit: 5dadef2ae805832d0cf8738bd8e41a1884885f2c
Parents: 00983c1
Author: Bartek Plotka <bw...@gmail.com>
Authored: Wed Dec 16 11:17:16 2015 -0800
Committer: Niklas Q. Nielsen <ni...@qni.dk>
Committed: Wed Dec 16 13:22:06 2015 -0800

----------------------------------------------------------------------
 src/Makefile.am                      |   8 ++
 src/slave/qos_controllers/load.cpp   | 222 ++++++++++++++++++++++++++++++
 src/slave/qos_controllers/load.hpp   |  76 ++++++++++
 src/tests/oversubscription_tests.cpp |  96 +++++++++++++
 4 files changed, 402 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/5dadef2a/src/Makefile.am
----------------------------------------------------------------------
diff --git a/src/Makefile.am b/src/Makefile.am
index 8f6b98b..e6d48dc 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1602,6 +1602,13 @@ libfixed_resource_estimator_la_SOURCES = slave/resource_estimators/fixed.cpp
 libfixed_resource_estimator_la_CPPFLAGS = $(MESOS_CPPFLAGS)
 libfixed_resource_estimator_la_LDFLAGS = $(MESOS_MODULE_LDFLAGS)
 
+# Library containing the load qos controller.
+lib_LTLIBRARIES += libload_qos_controller.la
+libload_qos_controller_la_SOURCES = slave/qos_controllers/load.hpp
+libload_qos_controller_la_SOURCES += slave/qos_controllers/load.cpp
+libload_qos_controller_la_CPPFLAGS = $(MESOS_CPPFLAGS)
+libload_qos_controller_la_LDFLAGS = $(MESOS_MODULE_LDFLAGS)
+
 # We need to build the test module libraries for running the test suite but
 # don't need to install them.  The 'noinst_' prefix ensures that these libraries
 # will not be installed.  However, it also skips building the shared libraries.
@@ -1669,6 +1676,7 @@ libtestqos_controller_la_CPPFLAGS = $(MESOS_CPPFLAGS)
 libtestqos_controller_la_LDFLAGS = $(MESOS_TEST_MODULE_LDFLAGS)
 
 mesos_tests_SOURCES =						\
+  slave/qos_controllers/load.cpp				\
   tests/anonymous_tests.cpp					\
   tests/attributes_tests.cpp					\
   tests/authentication_tests.cpp				\

http://git-wip-us.apache.org/repos/asf/mesos/blob/5dadef2a/src/slave/qos_controllers/load.cpp
----------------------------------------------------------------------
diff --git a/src/slave/qos_controllers/load.cpp b/src/slave/qos_controllers/load.cpp
new file mode 100644
index 0000000..52520d6
--- /dev/null
+++ b/src/slave/qos_controllers/load.cpp
@@ -0,0 +1,222 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <list>
+#include <string>
+
+#include <mesos/module/qos_controller.hpp>
+
+#include <mesos/slave/qos_controller.hpp>
+
+#include <process/defer.hpp>
+#include <process/dispatch.hpp>
+#include <process/owned.hpp>
+#include <process/process.hpp>
+
+#include <stout/lambda.hpp>
+#include <stout/numify.hpp>
+#include <stout/option.hpp>
+#include <stout/os/os.hpp>
+#include <stout/posix/os.hpp>
+#include <stout/result.hpp>
+
+#include "slave/qos_controllers/load.hpp"
+
+using namespace mesos;
+using namespace process;
+
+using std::list;
+using std::string;
+
+using mesos::modules::Module;
+
+using mesos::slave::QoSController;
+using mesos::slave::QoSCorrection;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+
+class LoadQoSControllerProcess : public Process<LoadQoSControllerProcess>
+{
+public:
+  LoadQoSControllerProcess(
+      const lambda::function<Future<ResourceUsage>()>& _usage,
+      const lambda::function<Try<os::Load>()>& _loadAverage,
+      const Option<double>& _loadThreshold5Min,
+      const Option<double>& _loadThreshold15Min)
+    : usage(_usage),
+      loadAverage(_loadAverage),
+      loadThreshold5Min(_loadThreshold5Min),
+      loadThreshold15Min(_loadThreshold15Min) {}
+
+  // Fetches the resource usage, then evaluates it within this process.
+  Future<std::list<QoSCorrection>> corrections()
+  {
+    return usage().then(defer(self(), &Self::_corrections, lambda::_1));
+  }
+
+  // Yields a KILL correction per executor holding revocable resources when
+  // a configured load threshold is exceeded; otherwise an empty list.
+  Future<std::list<QoSCorrection>> _corrections(const ResourceUsage& usage)
+  {
+    std::list<QoSCorrection> result;
+    const Try<os::Load> sample = loadAverage();
+    if (sample.isError()) {
+      LOG(ERROR) << "Failed to fetch system load: " << sample.error();
+      return result;
+    }
+
+    // Both thresholds are always checked so every violation gets logged.
+    bool exceeded = false;
+    if (loadThreshold5Min.isSome() &&
+        sample.get().five > loadThreshold5Min.get()) {
+      LOG(INFO) << "System 5 minutes load average " << sample.get().five
+                << " exceeds threshold " << loadThreshold5Min.get();
+      exceeded = true;
+    }
+
+    if (loadThreshold15Min.isSome() &&
+        sample.get().fifteen > loadThreshold15Min.get()) {
+      LOG(INFO) << "System 15 minutes load average " << sample.get().fifteen
+                << " exceeds threshold " << loadThreshold15Min.get();
+      exceeded = true;
+    }
+
+    if (!exceeded) {
+      return result;
+    }
+
+    for (const ResourceUsage::Executor& executor : usage.executors()) {
+      // Executors without revocable resources are left untouched.
+      if (Resources(executor.allocated()).revocable().empty()) {
+        continue;
+      }
+      const ExecutorInfo& info = executor.executor_info();
+      QoSCorrection correction;
+      correction.set_type(mesos::slave::QoSCorrection_Type_KILL);
+      correction.mutable_kill()->mutable_framework_id()->CopyFrom(
+          info.framework_id());
+      correction.mutable_kill()->mutable_executor_id()->CopyFrom(
+          info.executor_id());
+      result.push_back(correction);
+    }
+
+    return result;
+  }
+
+private:
+  const lambda::function<Future<ResourceUsage>()> usage;
+  const lambda::function<Try<os::Load>()> loadAverage;
+  const Option<double> loadThreshold5Min;
+  const Option<double> loadThreshold15Min;
+};
+
+
+LoadQoSController::~LoadQoSController()
+{
+  // A process only exists once `initialize` has been called.
+  if (process.get() == NULL) { return; }
+  terminate(process.get());
+  wait(process.get());
+}
+
+
+Try<Nothing> LoadQoSController::initialize(
+  const lambda::function<Future<ResourceUsage>()>& usage)
+{
+  // The backing process may be created only once per controller.
+  const bool initialized = process.get() != NULL;
+  if (initialized) {
+    return Error("Load QoS Controller has already been initialized");
+  }
+
+  // Pass the usage callback and the configuration on to a new process
+  // and start it.
+  LoadQoSControllerProcess* spawned = new LoadQoSControllerProcess(
+      usage, loadAverage, loadThreshold5Min, loadThreshold15Min);
+  process.reset(spawned);
+  spawn(spawned);
+
+  return Nothing();
+}
+
+
+process::Future<std::list<QoSCorrection>> LoadQoSController::corrections()
+{
+  // Without `initialize` there is no process to ask for corrections.
+  const bool initialized = process.get() != NULL;
+  if (!initialized) {
+    return Failure("Load QoS Controller is not initialized");
+  }
+
+  return dispatch(process.get(), &LoadQoSControllerProcess::corrections);
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+
+static QoSController* create(const Parameters& parameters)
+{
+  // Thresholds read from the module parameters; unset when not given.
+  Option<double> threshold5Min = None();
+  Option<double> threshold15Min = None();
+
+  for (const Parameter& parameter : parameters.parameter()) {
+    const bool is5Min = parameter.key() == "load_threshold_5min";
+    const bool is15Min = parameter.key() == "load_threshold_15min";
+    if (!is5Min && !is15Min) {
+      continue;
+    }
+
+    // A value which cannot be parsed as a number aborts the creation.
+    Try<double> parsed = numify<double>(parameter.value());
+    if (parsed.isError()) {
+      LOG(ERROR) << (is5Min ? "Failed to parse 5 min load threshold: "
+                            : "Failed to parse 15 min load threshold: ")
+                 << parsed.error();
+      return NULL;
+    }
+
+    if (is5Min) {
+      threshold5Min = parsed.get();
+    } else {
+      threshold15Min = parsed.get();
+    }
+  }
+
+  // Without any threshold there is nothing for the controller to check.
+  if (threshold5Min.isNone() && threshold15Min.isNone()) {
+    LOG(ERROR) << "No load thresholds are configured for LoadQoSController";
+    return NULL;
+  }
+
+  return new mesos::internal::slave::LoadQoSController(
+      threshold5Min, threshold15Min);
+}
+
+
+Module<QoSController> org_apache_mesos_LoadQoSController(
+    MESOS_MODULE_API_VERSION,
+    MESOS_VERSION,
+    "Apache Mesos",
+    "modules@mesos.apache.org",
+    "System Load QoS Controller Module.",
+    NULL,    // NOTE(review): presumably the optional compatibility hook.
+    create); // Factory defined above; parses the load thresholds.

http://git-wip-us.apache.org/repos/asf/mesos/blob/5dadef2a/src/slave/qos_controllers/load.hpp
----------------------------------------------------------------------
diff --git a/src/slave/qos_controllers/load.hpp b/src/slave/qos_controllers/load.hpp
new file mode 100644
index 0000000..098a6d0
--- /dev/null
+++ b/src/slave/qos_controllers/load.hpp
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef __SLAVE_QOS_CONTROLLERS_LOAD_HPP__
+#define __SLAVE_QOS_CONTROLLERS_LOAD_HPP__
+
+#include <list>
+
+#include <mesos/slave/qos_controller.hpp>
+
+#include <stout/lambda.hpp>
+#include <stout/os/os.hpp>
+#include <stout/try.hpp>
+
+#include <process/future.hpp>
+#include <process/owned.hpp>
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Forward declaration.
+class LoadQoSControllerProcess;
+
+
+// The `LoadQoSController` is a simple QoS Controller which requests
+// the eviction of all the revocable executors when the system load
+// (5min or 15min) is above the corresponding configured threshold.
+// NOTE: The 1 minute system load is ignored, because
+// for most use cases it is a misleading signal.
+class LoadQoSController : public mesos::slave::QoSController
+{
+public:
+  // NOTE: The function used for fetching the load average can be passed as
+  // an optional argument (default: `os::loadavg`). This exists for tests.
+  LoadQoSController(
+      const Option<double>& _loadThreshold5Min,
+      const Option<double>& _loadThreshold15Min,
+      const lambda::function<Try<os::Load>()>& _loadAverage =
+        [](){ return os::loadavg(); })
+    : loadThreshold5Min(_loadThreshold5Min),
+      loadThreshold15Min(_loadThreshold15Min),
+      loadAverage(_loadAverage) {}
+
+  virtual ~LoadQoSController();
+
+  virtual Try<Nothing> initialize(
+    const lambda::function<process::Future<ResourceUsage>()>& usage);
+
+  virtual process::Future<std::list<mesos::slave::QoSCorrection>> corrections();
+
+private:
+  const Option<double> loadThreshold5Min;
+  const Option<double> loadThreshold15Min;
+  const lambda::function<Try<os::Load>()> loadAverage;
+  process::Owned<LoadQoSControllerProcess> process;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __SLAVE_QOS_CONTROLLERS_LOAD_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/5dadef2a/src/tests/oversubscription_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/oversubscription_tests.cpp b/src/tests/oversubscription_tests.cpp
index 0333281..7a75fb3 100644
--- a/src/tests/oversubscription_tests.cpp
+++ b/src/tests/oversubscription_tests.cpp
@@ -43,6 +43,7 @@
 #include "slave/flags.hpp"
 #include "slave/monitor.hpp"
 #include "slave/slave.hpp"
+#include "slave/qos_controllers/load.hpp"
 
 #include "tests/flags.hpp"
 #include "tests/containerizer.hpp"
@@ -53,6 +54,7 @@ using namespace process;
 
 using mesos::internal::master::Master;
 
+using mesos::internal::slave::LoadQoSController;
 using mesos::internal::slave::ResourceMonitor;
 using mesos::internal::slave::Slave;
 
@@ -156,6 +158,23 @@ protected:
     return statistics;
   }
 
+  // Returns an `ExecutorInfo` in which only the executor ID and the
+  // framework ID are set, taken from the given strings.
+  ExecutorInfo createExecutorInfo(
+      const string& _frameworkId,
+      const string& _executorId)
+  {
+    ExecutorInfo info;
+
+    ExecutorID* executorId = info.mutable_executor_id();
+    executorId->set_value(_executorId);
+
+    FrameworkID* frameworkId = info.mutable_framework_id();
+    frameworkId->set_value(_frameworkId);
+
+    return info;
+  }
+
 private:
   string originalLDLibraryPath;
   Modules modules;
@@ -1105,6 +1124,83 @@ TEST_F(OversubscriptionTest, RemoveCapabilitiesOnSchedulerFailover)
   Shutdown();
 }
 
+
+// This test verifies the functionality of the Load QoS Controller.
+// If the total system load on the agent exceeds the configured threshold then
+// it should evict all revocable executors.
+// 1. Run first correction iteration with two revocable executors and the system
+//    load below the thresholds. No corrections should be returned.
+// 2. Run second correction iteration with the same executors and the system
+//    5min load above the threshold. One correction per executor is expected.
+TEST_F(OversubscriptionTest, LoadQoSController)
+{
+  // Configure Load QoS Controller. Revocable executors will be killed when
+  // the 5min load is above 7 or the 15min load is above 6.
+  // This configuration could be a reasonable one for an 8 CPUs machine.
+  const double loadThreshold5Min = 7;
+  const double loadThreshold15Min = 6;
+
+  // Prepare stubbed os::Load whose values are below thresholds.
+  os::Load stubLoad;
+
+  stubLoad.one = 1;
+  stubLoad.five = loadThreshold5Min - 0.2;
+  stubLoad.fifteen = loadThreshold15Min - 0.2;
+
+  // Construct `LoadQoSController` with configured thresholds and fake
+  // loadAverage lambda. The lambda reads `stubLoad` by reference, so later
+  // changes to `stubLoad` are seen by the controller.
+  LoadQoSController controller(loadThreshold5Min,
+                               loadThreshold15Min,
+                               [&stubLoad]() { return stubLoad; });
+
+  // Prepare lambda creating ResourceUsage stub with two revocable executors.
+  controller.initialize([this]() -> Future<ResourceUsage> {
+    ResourceUsage usage;
+    ResourceStatistics statistics = createResourceStatistics();
+
+    Resources resources = Resources::parse("mem:128").get();
+    resources += createRevocableResources("cpus", "1");
+
+    // Prepare first revocable executor.
+    ResourceUsage::Executor* executor = usage.add_executors();
+    executor->mutable_executor_info()->CopyFrom(
+        createExecutorInfo("framework", "executor1"));
+    executor->mutable_allocated()->CopyFrom(resources);
+    executor->mutable_statistics()->CopyFrom(statistics);
+
+    // Prepare second revocable executor.
+    resources = Resources::parse("mem:256").get();
+    resources += createRevocableResources("cpus", "7");
+
+    executor = usage.add_executors();
+    executor->mutable_executor_info()->CopyFrom(
+        createExecutorInfo("framework", "executor2"));
+    executor->mutable_allocated()->CopyFrom(resources);
+    executor->mutable_statistics()->CopyFrom(statistics);
+
+    return usage;
+  });
+
+  // First correction iteration. All system loads are below the threshold.
+  Future<list<QoSCorrection>> qosCorrections = controller.corrections();
+
+  AWAIT_READY(qosCorrections);
+
+  // Expect no corrections (gtest convention: expected value goes first).
+  ASSERT_EQ(0u, qosCorrections.get().size());
+
+  // Second correction iteration. Make system 5 minutes load above the
+  // threshold.
+  stubLoad.five = loadThreshold5Min + 0.2;
+  qosCorrections = controller.corrections();
+
+  AWAIT_READY(qosCorrections);
+
+  // Expect two corrections, since there were two revocable executors.
+  ASSERT_EQ(2u, qosCorrections.get().size());
+}
+
+
 } // namespace tests {
 } // namespace internal {
 } // namespace mesos {