You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by gr...@apache.org on 2018/08/01 20:24:29 UTC

[12/16] mesos git commit: Tested per-framework task state metrics.

Tested per-framework task state metrics.

This patch adds `MasterTest.TaskStateMetrics`, which verifies that
per-framework task state metrics for both terminal and active task
states report correct values, even after agent reregistration.

Review: https://reviews.apache.org/r/67187/


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f446775a
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f446775a
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f446775a

Branch: refs/heads/master
Commit: f446775aa45c6e7e489da0078b483048bcc881f1
Parents: 7754e86
Author: Greg Mann <gr...@mesosphere.io>
Authored: Wed Aug 1 07:59:14 2018 -0700
Committer: Greg Mann <gr...@gmail.com>
Committed: Wed Aug 1 13:07:55 2018 -0700

----------------------------------------------------------------------
 src/tests/master_tests.cpp | 231 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 231 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/f446775a/src/tests/master_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/master_tests.cpp b/src/tests/master_tests.cpp
index 91ce81b..5accfbb 100644
--- a/src/tests/master_tests.cpp
+++ b/src/tests/master_tests.cpp
@@ -115,6 +115,7 @@ using std::string;
 using std::vector;
 
 using testing::_;
+using testing::AllOf;
 using testing::AtMost;
 using testing::DoAll;
 using testing::Eq;
@@ -9107,6 +9108,236 @@ TEST_F(MasterTest, DropOperationWithIDAffectingDefaultResources)
 }
 
 
+// Verifies that both active and terminal per-framework task state metrics
+// report correct values, even after agent reregistration.
+TEST_F(MasterTest, TaskStateMetrics)
+{
+  Clock::pause(); // Stays paused until `Clock::resume()` near the end.
+
+  master::Flags masterFlags = CreateMasterFlags();
+  Try<Owned<cluster::Master>> master = StartMaster(masterFlags);
+  ASSERT_SOME(master);
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+
+  slave::Flags slaveFlags = CreateSlaveFlags();
+
+  Fetcher fetcher(slaveFlags);
+
+  Try<MesosContainerizer*> _containerizer =
+    MesosContainerizer::create(slaveFlags, true, &fetcher);
+  ASSERT_SOME(_containerizer);
+  Owned<slave::Containerizer> containerizer(_containerizer.get());
+
+  Try<Owned<cluster::Slave>> slave =
+    StartSlave(detector.get(), containerizer.get(), slaveFlags);
+  ASSERT_SOME(slave);
+
+  // Advance the clock to trigger agent registration.
+  Clock::advance(slaveFlags.registration_backoff_factor);
+
+  v1::FrameworkInfo frameworkInfo = v1::DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.set_checkpoint(true); // Presumably so tasks survive failover.
+
+  auto scheduler = std::make_shared<v1::MockHTTPScheduler>();
+
+  EXPECT_CALL(*scheduler, heartbeat(_))
+    .WillRepeatedly(Return()); // Ignore heartbeats.
+
+  EXPECT_CALL(*scheduler, connected(_))
+    .WillOnce(v1::scheduler::SendSubscribe(frameworkInfo));
+
+  Future<Event::Subscribed> subscribed;
+  EXPECT_CALL(*scheduler, subscribed(_, _))
+    .WillOnce(FutureArg<1>(&subscribed));
+
+  Future<Event::Offers> offers1;
+  EXPECT_CALL(*scheduler, offers(_, _))
+    .WillOnce(FutureArg<1>(&offers1));
+
+  v1::scheduler::TestMesos mesos(
+      master.get()->pid,
+      ContentType::PROTOBUF,
+      scheduler);
+
+  AWAIT_READY(subscribed);
+
+  v1::FrameworkID frameworkId(subscribed->framework_id());
+  frameworkInfo.mutable_id()->CopyFrom(frameworkId);
+
+  AWAIT_READY(offers1);
+  ASSERT_FALSE(offers1->offers().empty());
+
+  const v1::Offer& offer1 = offers1->offers(0);
+  const v1::AgentID& agentId = offer1.agent_id();
+
+  // Set `refuse_seconds` to zero so that the remaining resources
+  // are offered again immediately.
+  v1::Filters filters;
+  filters.set_refuse_seconds(0);
+
+  // The first task is long-running.
+  v1::TaskInfo task1 = v1::createTask(
+      agentId,
+      v1::Resources::parse("cpus:0.1;mem:64").get(),
+      SLEEP_COMMAND(300));
+
+  testing::Sequence taskSequence1;
+  Future<Event::Update> runningUpdate;
+
+  EXPECT_CALL(
+      *scheduler,
+      update(_, AllOf(
+          TaskStatusUpdateTaskIdEq(task1),
+          TaskStatusUpdateStateEq(v1::TASK_STARTING))))
+    .InSequence(taskSequence1)
+    .WillOnce(v1::scheduler::SendAcknowledge(frameworkId, agentId));
+
+  EXPECT_CALL(
+      *scheduler,
+      update(_, AllOf(
+          TaskStatusUpdateTaskIdEq(task1),
+          TaskStatusUpdateStateEq(v1::TASK_RUNNING))))
+    .InSequence(taskSequence1)
+    .WillOnce(DoAll(
+        v1::scheduler::SendAcknowledge(frameworkId, agentId),
+        FutureArg<1>(&runningUpdate)));
+
+  mesos.send(
+      v1::createCallAccept(
+          frameworkId,
+          offer1,
+          {v1::LAUNCH({task1})},
+          filters));
+
+  AWAIT_READY(runningUpdate);
+
+  Future<Event::Offers> offers2;
+  EXPECT_CALL(*scheduler, offers(_, _))
+    .WillOnce(FutureArg<1>(&offers2));
+
+  Clock::advance(masterFlags.allocation_interval);
+
+  AWAIT_READY(offers2);
+  ASSERT_FALSE(offers2->offers().empty());
+
+  const v1::Offer& offer2 = offers2->offers(0);
+
+  // The second task finishes immediately. Its TASK_FINISHED status update is
+  // never acknowledged so that the agent will include the task in its
+  // reregistration message.
+
+  v1::TaskInfo task2 = v1::createTask(
+      agentId,
+      v1::Resources::parse("cpus:0.1;mem:64").get(),
+      "echo done");
+
+  testing::Sequence taskSequence2;
+  Future<Event::Update> finishedUpdate;
+
+  EXPECT_CALL(
+      *scheduler,
+      update(_, AllOf(
+          TaskStatusUpdateTaskIdEq(task2),
+          TaskStatusUpdateStateEq(v1::TASK_STARTING))))
+    .InSequence(taskSequence2)
+    .WillOnce(v1::scheduler::SendAcknowledge(frameworkId, agentId));
+
+  EXPECT_CALL(
+      *scheduler,
+      update(_, AllOf(
+          TaskStatusUpdateTaskIdEq(task2),
+          TaskStatusUpdateStateEq(v1::TASK_RUNNING))))
+    .InSequence(taskSequence2)
+    .WillOnce(v1::scheduler::SendAcknowledge(frameworkId, agentId));
+
+  EXPECT_CALL(
+      *scheduler,
+      update(_, AllOf(
+          TaskStatusUpdateTaskIdEq(task2),
+          TaskStatusUpdateStateEq(v1::TASK_FINISHED))))
+    .InSequence(taskSequence2)
+    .WillOnce(FutureArg<1>(&finishedUpdate)) // Deliberately not acknowledged.
+    .WillRepeatedly(Return()); // Ignore retries.
+
+  mesos.send(
+      v1::createCallAccept(
+          frameworkId,
+          offer2,
+          {v1::LAUNCH({task2})},
+          filters));
+
+  AWAIT_READY(finishedUpdate);
+
+  JSON::Object metrics1 = Metrics(); // Snapshot taken before agent failover.
+
+  const string prefix =
+    master::getFrameworkMetricPrefix(devolve(frameworkInfo));
+
+  // Verify global master metrics.
+  EXPECT_EQ(1, metrics1.values["master/tasks_running"]);
+  EXPECT_EQ(1, metrics1.values["master/tasks_finished"]);
+
+  // Verify per-framework metrics.
+  EXPECT_EQ(1, metrics1.values[prefix + "tasks/active/task_running"]);
+  EXPECT_EQ(1, metrics1.values[prefix + "tasks/terminal/task_finished"]);
+
+  // Fail over the agent to ensure that the master's task state metrics
+  // remain correct after agent reregistration.
+  slave.get()->terminate();
+  slave->reset();
+
+  Future<ReregisterExecutorMessage> reregisterExecutorMessage =
+    FUTURE_PROTOBUF(ReregisterExecutorMessage(), _, _);
+  Future<ReregisterSlaveMessage> reregisterSlaveMessage =
+    FUTURE_PROTOBUF(ReregisterSlaveMessage(), _, _);
+  Future<SlaveReregisteredMessage> slaveReregisteredMessage =
+    FUTURE_PROTOBUF(SlaveReregisteredMessage(), _, _);
+
+  // Restart the agent with a new containerizer.
+  _containerizer = MesosContainerizer::create(slaveFlags, true, &fetcher);
+  ASSERT_SOME(_containerizer);
+  containerizer.reset(_containerizer.get());
+
+  slave = StartSlave(detector.get(), containerizer.get(), slaveFlags);
+  ASSERT_SOME(slave);
+
+  // Ensure that the executor has reregistered before we advance the clock,
+  // to avoid timing out the executor.
+  AWAIT_READY(reregisterExecutorMessage);
+
+  // Advance by the executor reregistration timeout so agent recovery completes.
+  Clock::advance(slaveFlags.executor_reregistration_timeout);
+  Clock::settle();
+
+  // Advance by the registration backoff factor so the agent reregisters.
+  Clock::advance(slaveFlags.registration_backoff_factor);
+
+  // Resume the clock to avoid deadlocks related to agent registration.
+  // See MESOS-8828.
+  Clock::resume();
+
+  AWAIT_READY(reregisterSlaveMessage);
+
+  // The agent should provide both of its tasks during reregistration, since one
+  // is still running and the other has an unacknowledged terminal update. We
+  // want to verify that metric values are correct after this occurs.
+  EXPECT_EQ(2, reregisterSlaveMessage->tasks_size());
+
+  AWAIT_READY(slaveReregisteredMessage);
+
+  JSON::Object metrics2 = Metrics(); // Snapshot taken after reregistration.
+
+  // Verify global master metrics.
+  EXPECT_EQ(1, metrics2.values["master/tasks_running"]);
+  EXPECT_EQ(1, metrics2.values["master/tasks_finished"]);
+
+  // Verify per-framework metrics.
+  EXPECT_EQ(1, metrics2.values[prefix + "tasks/active/task_running"]);
+  EXPECT_EQ(1, metrics2.values[prefix + "tasks/terminal/task_finished"]);
+}
+
+
 class MasterTestPrePostReservationRefinement
   : public MasterTest,
     public WithParamInterface<bool> {