You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by ji...@apache.org on 2015/06/23 23:31:12 UTC

mesos git commit: Added slave metric to count container launch failures.

Repository: mesos
Updated Branches:
  refs/heads/master 406d72ed0 -> 61e116dd6


Added slave metric to count container launch failures.

Review: https://reviews.apache.org/r/35738


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/61e116dd
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/61e116dd
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/61e116dd

Branch: refs/heads/master
Commit: 61e116dd668c8a5bf2bba12fd908ff2e9c445d23
Parents: 406d72e
Author: Paul Brett <pa...@twopensource.com>
Authored: Tue Jun 23 13:02:04 2015 -0700
Committer: Jie Yu <yu...@gmail.com>
Committed: Tue Jun 23 14:31:02 2015 -0700

----------------------------------------------------------------------
 src/slave/metrics.cpp       |  8 +++++-
 src/slave/metrics.hpp       |  2 ++
 src/slave/slave.cpp         |  4 ++-
 src/tests/containerizer.cpp | 33 ++++++++++-----------
 src/tests/metrics_tests.cpp |  2 ++
 src/tests/slave_tests.cpp   | 62 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 93 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/61e116dd/src/slave/metrics.cpp
----------------------------------------------------------------------
diff --git a/src/slave/metrics.cpp b/src/slave/metrics.cpp
index 87289f2..ae3a53e 100644
--- a/src/slave/metrics.cpp
+++ b/src/slave/metrics.cpp
@@ -84,7 +84,9 @@ Metrics::Metrics(const Slave& slave)
         "slave/invalid_framework_messages"),
     executor_directory_max_allowed_age_secs(
         "slave/executor_directory_max_allowed_age_secs",
-        defer(slave, &Slave::_executor_directory_max_allowed_age_secs))
+        defer(slave, &Slave::_executor_directory_max_allowed_age_secs)),
+    container_launch_errors(
+        "slave/container_launch_errors")
 {
   // TODO(dhamon): Check return values for metric registration.
   process::metrics::add(uptime_secs);
@@ -115,6 +117,8 @@ Metrics::Metrics(const Slave& slave)
 
   process::metrics::add(executor_directory_max_allowed_age_secs);
 
+  process::metrics::add(container_launch_errors);
+
   // Create resource gauges.
   // TODO(dhamon): Set these up dynamically when creating a slave
   // based on the resources it exposes.
@@ -197,6 +201,8 @@ Metrics::~Metrics()
 
   process::metrics::remove(executor_directory_max_allowed_age_secs);
 
+  process::metrics::remove(container_launch_errors);
+
   foreach (const Gauge& gauge, resources_total) {
     process::metrics::remove(gauge);
   }

http://git-wip-us.apache.org/repos/asf/mesos/blob/61e116dd/src/slave/metrics.hpp
----------------------------------------------------------------------
diff --git a/src/slave/metrics.hpp b/src/slave/metrics.hpp
index 780c438..43c8662 100644
--- a/src/slave/metrics.hpp
+++ b/src/slave/metrics.hpp
@@ -65,6 +65,8 @@ struct Metrics
 
   process::metrics::Gauge executor_directory_max_allowed_age_secs;
 
+  process::metrics::Counter container_launch_errors;
+
   // Non-revocable resources.
   std::vector<process::metrics::Gauge> resources_total;
   std::vector<process::metrics::Gauge> resources_used;

http://git-wip-us.apache.org/repos/asf/mesos/blob/61e116dd/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 946e372..b3e1ccc 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -3197,8 +3197,9 @@ void Slave::executorLaunched(
                << "' failed to start: "
                << (future.isFailed() ? future.failure() : " future discarded");
 
-    containerizer->destroy(containerId);
+    ++metrics.container_launch_errors;
 
+    containerizer->destroy(containerId);
     return;
   } else if (!future.get()) {
     LOG(ERROR) << "Container '" << containerId
@@ -3208,6 +3209,7 @@ void Slave::executorLaunched(
                << flags.containerizers << ") could create a container for the "
                << "provided TaskInfo/ExecutorInfo message.";
 
+     ++metrics.container_launch_errors;
     return;
   }
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/61e116dd/src/tests/containerizer.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer.cpp b/src/tests/containerizer.cpp
index 80b9105..68c6f98 100644
--- a/src/tests/containerizer.cpp
+++ b/src/tests/containerizer.cpp
@@ -182,7 +182,7 @@ void TestContainerizer::destroy(
   std::pair<FrameworkID, ExecutorID> key(frameworkId, executorId);
   if (!containers_.contains(key)) {
     LOG(WARNING) << "Ignoring destroy of unknown container for executor '"
-                  << executorId << "' of framework '" << frameworkId << "'";
+                 << executorId << "' of framework " << frameworkId;
     return;
   }
   destroy(containers_[key]);
@@ -191,21 +191,22 @@ void TestContainerizer::destroy(
 
 void TestContainerizer::destroy(const ContainerID& containerId)
 {
-  CHECK(drivers.contains(containerId))
-    << "Failed to terminate container " << containerId
-    << " because it is has not been started";
-
-  Owned<MesosExecutorDriver> driver = drivers[containerId];
-  driver->stop();
-  driver->join();
-  drivers.erase(containerId);
-
-  containerizer::Termination termination;
-  termination.set_killed(false);
-  termination.set_message("Killed executor");
-  termination.set_status(0);
-  promises[containerId]->set(termination);
-  promises.erase(containerId);
+  if (drivers.contains(containerId)) {
+    Owned<MesosExecutorDriver> driver = drivers[containerId];
+    driver->stop();
+    driver->join();
+    drivers.erase(containerId);
+  }
+
+  if (promises.contains(containerId)) {
+    containerizer::Termination termination;
+    termination.set_killed(false);
+    termination.set_message("Killed executor");
+    termination.set_status(0);
+
+    promises[containerId]->set(termination);
+    promises.erase(containerId);
+  }
 }
 
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/61e116dd/src/tests/metrics_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/metrics_tests.cpp b/src/tests/metrics_tests.cpp
index 6896727..3e9d7c2 100644
--- a/src/tests/metrics_tests.cpp
+++ b/src/tests/metrics_tests.cpp
@@ -201,6 +201,8 @@ TEST_F(MetricsTest, Slave)
   EXPECT_EQ(1u, stats.values.count("slave/valid_framework_messages"));
   EXPECT_EQ(1u, stats.values.count("slave/invalid_framework_messages"));
 
+  EXPECT_EQ(1u, stats.values.count("slave/container_launch_errors"));
+
   EXPECT_EQ(1u, stats.values.count("slave/cpus_total"));
   EXPECT_EQ(1u, stats.values.count("slave/cpus_used"));
   EXPECT_EQ(1u, stats.values.count("slave/cpus_percent"));

http://git-wip-us.apache.org/repos/asf/mesos/blob/61e116dd/src/tests/slave_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index 5030198..027da84 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -911,6 +911,8 @@ TEST_F(SlaveTest, MetricsInMetricsEndpoint)
       1u,
       snapshot.values.count("slave/executor_directory_max_allowed_age_secs"));
 
+  EXPECT_EQ(1u, snapshot.values.count("slave/container_launch_errors"));
+
   EXPECT_EQ(1u, snapshot.values.count("slave/cpus_total"));
   EXPECT_EQ(1u, snapshot.values.count("slave/cpus_used"));
   EXPECT_EQ(1u, snapshot.values.count("slave/cpus_percent"));
@@ -927,6 +929,66 @@ TEST_F(SlaveTest, MetricsInMetricsEndpoint)
 }
 
 
+// Test to verify that we increment the container launch errors metric
+// when we fail to launch a container.
+TEST_F(SlaveTest, MetricsSlaveLaunchErrors)
+{
+  // Start a master.
+  Try<PID<Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  TestContainerizer containerizer;
+
+  // Start a slave.
+  Try<PID<Slave>> slave = StartSlave(&containerizer);
+  ASSERT_SOME(slave);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(_, _, _));
+
+  Future<vector<Offer>> offers;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  driver.start();
+
+  AWAIT_READY(offers);
+  EXPECT_NE(0u, offers.get().size());
+  const Offer offer = offers.get()[0];
+
+  // Verify that we start with no launch failures.
+  JSON::Object snapshot = Metrics();
+  EXPECT_EQ(0, snapshot.values["slave/container_launch_errors"]);
+
+  EXPECT_CALL(containerizer, launch(_, _, _, _, _, _, _))
+    .WillOnce(Return(Failure("Injected failure")));
+
+  EXPECT_CALL(sched, statusUpdate(&driver, _));
+
+  // Try to start a task
+  TaskInfo task = createTask(
+      offer.slave_id(),
+      Resources::parse("cpus:1;mem:32").get(),
+      "sleep 1000",
+      DEFAULT_EXECUTOR_ID);
+
+  driver.launchTasks(offer.id(), {task});
+
+  // After failure injection, metrics should report a single failure.
+  snapshot = Metrics();
+  EXPECT_EQ(1, snapshot.values["slave/container_launch_errors"]);
+
+  driver.stop();
+  driver.join();
+
+  Shutdown();
+}
+
+
 TEST_F(SlaveTest, StateEndpoint)
 {
   Try<PID<Master>> master = StartMaster();