You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by ji...@apache.org on 2015/06/23 23:31:12 UTC
mesos git commit: Added slave metric to count container launch
failures.
Repository: mesos
Updated Branches:
refs/heads/master 406d72ed0 -> 61e116dd6
Added slave metric to count container launch failures.
Review: https://reviews.apache.org/r/35738
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/61e116dd
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/61e116dd
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/61e116dd
Branch: refs/heads/master
Commit: 61e116dd668c8a5bf2bba12fd908ff2e9c445d23
Parents: 406d72e
Author: Paul Brett <pa...@twopensource.com>
Authored: Tue Jun 23 13:02:04 2015 -0700
Committer: Jie Yu <yu...@gmail.com>
Committed: Tue Jun 23 14:31:02 2015 -0700
----------------------------------------------------------------------
src/slave/metrics.cpp | 8 +++++-
src/slave/metrics.hpp | 2 ++
src/slave/slave.cpp | 4 ++-
src/tests/containerizer.cpp | 33 ++++++++++-----------
src/tests/metrics_tests.cpp | 2 ++
src/tests/slave_tests.cpp | 62 ++++++++++++++++++++++++++++++++++++++++
6 files changed, 93 insertions(+), 18 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/61e116dd/src/slave/metrics.cpp
----------------------------------------------------------------------
diff --git a/src/slave/metrics.cpp b/src/slave/metrics.cpp
index 87289f2..ae3a53e 100644
--- a/src/slave/metrics.cpp
+++ b/src/slave/metrics.cpp
@@ -84,7 +84,9 @@ Metrics::Metrics(const Slave& slave)
"slave/invalid_framework_messages"),
executor_directory_max_allowed_age_secs(
"slave/executor_directory_max_allowed_age_secs",
- defer(slave, &Slave::_executor_directory_max_allowed_age_secs))
+ defer(slave, &Slave::_executor_directory_max_allowed_age_secs)),
+ container_launch_errors(
+ "slave/container_launch_errors")
{
// TODO(dhamon): Check return values for metric registration.
process::metrics::add(uptime_secs);
@@ -115,6 +117,8 @@ Metrics::Metrics(const Slave& slave)
process::metrics::add(executor_directory_max_allowed_age_secs);
+ process::metrics::add(container_launch_errors);
+
// Create resource gauges.
// TODO(dhamon): Set these up dynamically when creating a slave
// based on the resources it exposes.
@@ -197,6 +201,8 @@ Metrics::~Metrics()
process::metrics::remove(executor_directory_max_allowed_age_secs);
+ process::metrics::remove(container_launch_errors);
+
foreach (const Gauge& gauge, resources_total) {
process::metrics::remove(gauge);
}
http://git-wip-us.apache.org/repos/asf/mesos/blob/61e116dd/src/slave/metrics.hpp
----------------------------------------------------------------------
diff --git a/src/slave/metrics.hpp b/src/slave/metrics.hpp
index 780c438..43c8662 100644
--- a/src/slave/metrics.hpp
+++ b/src/slave/metrics.hpp
@@ -65,6 +65,8 @@ struct Metrics
process::metrics::Gauge executor_directory_max_allowed_age_secs;
+ process::metrics::Counter container_launch_errors;
+
// Non-revocable resources.
std::vector<process::metrics::Gauge> resources_total;
std::vector<process::metrics::Gauge> resources_used;
http://git-wip-us.apache.org/repos/asf/mesos/blob/61e116dd/src/slave/slave.cpp
----------------------------------------------------------------------
diff --git a/src/slave/slave.cpp b/src/slave/slave.cpp
index 946e372..b3e1ccc 100644
--- a/src/slave/slave.cpp
+++ b/src/slave/slave.cpp
@@ -3197,8 +3197,9 @@ void Slave::executorLaunched(
<< "' failed to start: "
<< (future.isFailed() ? future.failure() : " future discarded");
- containerizer->destroy(containerId);
+ ++metrics.container_launch_errors;
+ containerizer->destroy(containerId);
return;
} else if (!future.get()) {
LOG(ERROR) << "Container '" << containerId
@@ -3208,6 +3209,7 @@ void Slave::executorLaunched(
<< flags.containerizers << ") could create a container for the "
<< "provided TaskInfo/ExecutorInfo message.";
+ ++metrics.container_launch_errors;
return;
}
http://git-wip-us.apache.org/repos/asf/mesos/blob/61e116dd/src/tests/containerizer.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer.cpp b/src/tests/containerizer.cpp
index 80b9105..68c6f98 100644
--- a/src/tests/containerizer.cpp
+++ b/src/tests/containerizer.cpp
@@ -182,7 +182,7 @@ void TestContainerizer::destroy(
std::pair<FrameworkID, ExecutorID> key(frameworkId, executorId);
if (!containers_.contains(key)) {
LOG(WARNING) << "Ignoring destroy of unknown container for executor '"
- << executorId << "' of framework '" << frameworkId << "'";
+ << executorId << "' of framework " << frameworkId;
return;
}
destroy(containers_[key]);
@@ -191,21 +191,22 @@ void TestContainerizer::destroy(
void TestContainerizer::destroy(const ContainerID& containerId)
{
- CHECK(drivers.contains(containerId))
- << "Failed to terminate container " << containerId
- << " because it is has not been started";
-
- Owned<MesosExecutorDriver> driver = drivers[containerId];
- driver->stop();
- driver->join();
- drivers.erase(containerId);
-
- containerizer::Termination termination;
- termination.set_killed(false);
- termination.set_message("Killed executor");
- termination.set_status(0);
- promises[containerId]->set(termination);
- promises.erase(containerId);
+ if (drivers.contains(containerId)) {
+ Owned<MesosExecutorDriver> driver = drivers[containerId];
+ driver->stop();
+ driver->join();
+ drivers.erase(containerId);
+ }
+
+ if (promises.contains(containerId)) {
+ containerizer::Termination termination;
+ termination.set_killed(false);
+ termination.set_message("Killed executor");
+ termination.set_status(0);
+
+ promises[containerId]->set(termination);
+ promises.erase(containerId);
+ }
}
http://git-wip-us.apache.org/repos/asf/mesos/blob/61e116dd/src/tests/metrics_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/metrics_tests.cpp b/src/tests/metrics_tests.cpp
index 6896727..3e9d7c2 100644
--- a/src/tests/metrics_tests.cpp
+++ b/src/tests/metrics_tests.cpp
@@ -201,6 +201,8 @@ TEST_F(MetricsTest, Slave)
EXPECT_EQ(1u, stats.values.count("slave/valid_framework_messages"));
EXPECT_EQ(1u, stats.values.count("slave/invalid_framework_messages"));
+ EXPECT_EQ(1u, stats.values.count("slave/container_launch_errors"));
+
EXPECT_EQ(1u, stats.values.count("slave/cpus_total"));
EXPECT_EQ(1u, stats.values.count("slave/cpus_used"));
EXPECT_EQ(1u, stats.values.count("slave/cpus_percent"));
http://git-wip-us.apache.org/repos/asf/mesos/blob/61e116dd/src/tests/slave_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/slave_tests.cpp b/src/tests/slave_tests.cpp
index 5030198..027da84 100644
--- a/src/tests/slave_tests.cpp
+++ b/src/tests/slave_tests.cpp
@@ -911,6 +911,8 @@ TEST_F(SlaveTest, MetricsInMetricsEndpoint)
1u,
snapshot.values.count("slave/executor_directory_max_allowed_age_secs"));
+ EXPECT_EQ(1u, snapshot.values.count("slave/container_launch_errors"));
+
EXPECT_EQ(1u, snapshot.values.count("slave/cpus_total"));
EXPECT_EQ(1u, snapshot.values.count("slave/cpus_used"));
EXPECT_EQ(1u, snapshot.values.count("slave/cpus_percent"));
@@ -927,6 +929,66 @@ TEST_F(SlaveTest, MetricsInMetricsEndpoint)
}
+// Test to verify that we increment the container launch errors metric
+// when we fail to launch a container.
+TEST_F(SlaveTest, MetricsSlaveLaunchErrors)
+{
+ // Start a master.
+ Try<PID<Master>> master = StartMaster();
+ ASSERT_SOME(master);
+
+ TestContainerizer containerizer;
+
+ // Start a slave.
+ Try<PID<Slave>> slave = StartSlave(&containerizer);
+ ASSERT_SOME(slave);
+
+ MockScheduler sched;
+ MesosSchedulerDriver driver(
+ &sched, DEFAULT_FRAMEWORK_INFO, master.get(), DEFAULT_CREDENTIAL);
+
+ EXPECT_CALL(sched, registered(_, _, _));
+
+ Future<vector<Offer>> offers;
+ EXPECT_CALL(sched, resourceOffers(&driver, _))
+ .WillOnce(FutureArg<1>(&offers))
+ .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+ driver.start();
+
+ AWAIT_READY(offers);
+ EXPECT_NE(0u, offers.get().size());
+ const Offer offer = offers.get()[0];
+
+ // Verify that we start with no launch failures.
+ JSON::Object snapshot = Metrics();
+ EXPECT_EQ(0, snapshot.values["slave/container_launch_errors"]);
+
+ EXPECT_CALL(containerizer, launch(_, _, _, _, _, _, _))
+ .WillOnce(Return(Failure("Injected failure")));
+
+ EXPECT_CALL(sched, statusUpdate(&driver, _));
+
+ // Try to start a task
+ TaskInfo task = createTask(
+ offer.slave_id(),
+ Resources::parse("cpus:1;mem:32").get(),
+ "sleep 1000",
+ DEFAULT_EXECUTOR_ID);
+
+ driver.launchTasks(offer.id(), {task});
+
+ // After failure injection, metrics should report a single failure.
+ snapshot = Metrics();
+ EXPECT_EQ(1, snapshot.values["slave/container_launch_errors"]);
+
+ driver.stop();
+ driver.join();
+
+ Shutdown();
+}
+
+
TEST_F(SlaveTest, StateEndpoint)
{
Try<PID<Master>> master = StartMaster();