You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by gi...@apache.org on 2018/04/17 18:56:01 UTC

[1/5] mesos git commit: Added unit test slave recovery for default executor tests.

Repository: mesos
Updated Branches:
  refs/heads/master a95d9b8fb -> f62c22a99


Added unit test slave recovery for default executor tests.

Review: https://reviews.apache.org/r/66538


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/2bfcbccd
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/2bfcbccd
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/2bfcbccd

Branch: refs/heads/master
Commit: 2bfcbccdeeb37c5f83e571113b03fa904b3fee05
Parents: a95d9b8
Author: Gilbert Song <so...@gmail.com>
Authored: Sat Apr 7 01:38:50 2018 -0700
Committer: Gilbert Song <so...@gmail.com>
Committed: Tue Apr 17 10:49:24 2018 -0700

----------------------------------------------------------------------
 src/tests/default_executor_tests.cpp | 98 +++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/2bfcbccd/src/tests/default_executor_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/default_executor_tests.cpp b/src/tests/default_executor_tests.cpp
index 293dd20..73a446c 100644
--- a/src/tests/default_executor_tests.cpp
+++ b/src/tests/default_executor_tests.cpp
@@ -1372,6 +1372,104 @@ TEST_P(DefaultExecutorTest, ReservedResources)
 }
 
 
+// This test verifies that the agent can recover after a restart
+// when its metadata is checkpointed (framework checkpointing on).
+TEST_P(DefaultExecutorTest, SlaveRecoveryWithMetadataCheckpointed)
+{
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  slave::Flags flags = CreateSlaveFlags();
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), flags);
+  ASSERT_SOME(slave);
+
+  auto scheduler = std::make_shared<v1::MockHTTPScheduler>();
+
+  v1::FrameworkInfo frameworkInfo = v1::DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.set_roles(0, DEFAULT_TEST_ROLE);
+  frameworkInfo.set_checkpoint(true);
+
+  EXPECT_CALL(*scheduler, connected(_))
+    .WillOnce(v1::scheduler::SendSubscribe(frameworkInfo));
+
+  Future<v1::scheduler::Event::Subscribed> subscribed;
+  EXPECT_CALL(*scheduler, subscribed(_, _))
+    .WillOnce(FutureArg<1>(&subscribed));
+
+  Future<v1::scheduler::Event::Offers> offers;
+  EXPECT_CALL(*scheduler, offers(_, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return());
+
+  EXPECT_CALL(*scheduler, heartbeat(_))
+    .WillRepeatedly(Return()); // Ignore heartbeats.
+
+  v1::scheduler::TestMesos mesos(
+      master.get()->pid,
+      ContentType::PROTOBUF,
+      scheduler);
+
+  AWAIT_READY(subscribed);
+
+  v1::FrameworkID frameworkId(subscribed->framework_id());
+  v1::ExecutorInfo executorInfo = v1::createExecutorInfo(
+      v1::DEFAULT_EXECUTOR_ID,
+      None(),
+      "cpus:0.1;mem:32;disk:32",
+      v1::ExecutorInfo::DEFAULT,
+      frameworkId);
+
+  AWAIT_READY(offers);
+  ASSERT_FALSE(offers->offers().empty());
+
+  const v1::Offer& offer = offers->offers(0);
+  const v1::AgentID& agentId = offer.agent_id();
+
+  v1::TaskInfo taskInfo = v1::createTask(
+      agentId,
+      v1::Resources::parse("cpus:0.1;mem:32;disk:32").get(),
+      "sleep 1000");
+
+  v1::Offer::Operation launchGroup =
+    v1::LAUNCH_GROUP(executorInfo, v1::createTaskGroupInfo({taskInfo}));
+
+  Future<v1::scheduler::Event::Update> startingUpdate;
+  Future<v1::scheduler::Event::Update> runningUpdate;
+  EXPECT_CALL(*scheduler, update(_, _))
+    .WillOnce(DoAll(
+        FutureArg<1>(&startingUpdate),
+        v1::scheduler::SendAcknowledge(frameworkId, agentId)))
+    .WillOnce(DoAll(
+        FutureArg<1>(&runningUpdate),
+        v1::scheduler::SendAcknowledge(frameworkId, agentId)))
+    .WillRepeatedly(Return()); // Ignore subsequent status updates.
+
+  mesos.send(v1::createCallAccept(frameworkId, offer, {launchGroup}));
+
+  AWAIT_READY(startingUpdate);
+  ASSERT_EQ(v1::TASK_STARTING, startingUpdate->status().state());
+  ASSERT_EQ(taskInfo.task_id(), startingUpdate->status().task_id());
+
+  AWAIT_READY(runningUpdate);
+  ASSERT_EQ(v1::TASK_RUNNING, runningUpdate->status().state());
+  EXPECT_EQ(taskInfo.task_id(), runningUpdate->status().task_id());
+  EXPECT_TRUE(runningUpdate->status().has_timestamp());
+  ASSERT_TRUE(runningUpdate->status().has_container_status());
+
+  slave.get()->terminate();
+  slave->reset();
+
+  Future<Nothing> _recover = FUTURE_DISPATCH(_, &Slave::_recover);
+
+  slave = this->StartSlave(detector.get(), flags);
+  ASSERT_SOME(slave);
+
+  AWAIT_READY(_recover);
+}
+
+
 // This is a regression test for MESOS-7926. It verifies that if
 // the default executor process is killed, the future of the nested
 // container destroy will be discarded and that discard will


[4/5] mesos git commit: Added default executor test for agent recovery without metadata.

Posted by gi...@apache.org.
Added default executor test for agent recovery without metadata.

Review: https://reviews.apache.org/r/66541


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/bd447bb5
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/bd447bb5
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/bd447bb5

Branch: refs/heads/master
Commit: bd447bb5c0295b7cb9f5773b02d1e4cb52bd154e
Parents: eeaf505
Author: Gilbert Song <so...@gmail.com>
Authored: Sun Apr 8 20:01:20 2018 -0700
Committer: Gilbert Song <so...@gmail.com>
Committed: Tue Apr 17 10:49:38 2018 -0700

----------------------------------------------------------------------
 src/tests/default_executor_tests.cpp | 107 ++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/bd447bb5/src/tests/default_executor_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/default_executor_tests.cpp b/src/tests/default_executor_tests.cpp
index 73a446c..bf849c4 100644
--- a/src/tests/default_executor_tests.cpp
+++ b/src/tests/default_executor_tests.cpp
@@ -1470,6 +1470,113 @@ TEST_P(DefaultExecutorTest, SlaveRecoveryWithMetadataCheckpointed)
 }
 
 
+#ifdef __linux__
+// This test verifies that the agent can recover after a restart
+// when its metadata is not checkpointed (framework checkpointing
+// off). This is a regression test for MESOS-8416.
+//
+// TODO(gilbert): For now, the test is Linux-specific because
+// the posix launcher is not able to destroy orphan containers
+// after recovery. Remove the `#ifdef __linux__` once MESOS-8771
+// is fixed.
+TEST_P(DefaultExecutorTest, ROOT_SlaveRecoveryWithoutMetadataCheckpointed)
+{
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  slave::Flags flags = CreateSlaveFlags();
+  flags.launcher = "linux";
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), flags);
+  ASSERT_SOME(slave);
+
+  auto scheduler = std::make_shared<v1::MockHTTPScheduler>();
+
+  v1::FrameworkInfo frameworkInfo = v1::DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.set_roles(0, DEFAULT_TEST_ROLE);
+  frameworkInfo.set_checkpoint(false);
+
+  EXPECT_CALL(*scheduler, connected(_))
+    .WillOnce(v1::scheduler::SendSubscribe(frameworkInfo));
+
+  Future<v1::scheduler::Event::Subscribed> subscribed;
+  EXPECT_CALL(*scheduler, subscribed(_, _))
+    .WillOnce(FutureArg<1>(&subscribed));
+
+  Future<v1::scheduler::Event::Offers> offers;
+  EXPECT_CALL(*scheduler, offers(_, _))
+    .WillOnce(FutureArg<1>(&offers))
+    .WillRepeatedly(Return());
+
+  EXPECT_CALL(*scheduler, heartbeat(_))
+    .WillRepeatedly(Return()); // Ignore heartbeats.
+
+  v1::scheduler::TestMesos mesos(
+      master.get()->pid,
+      ContentType::PROTOBUF,
+      scheduler);
+
+  AWAIT_READY(subscribed);
+
+  v1::FrameworkID frameworkId(subscribed->framework_id());
+  v1::ExecutorInfo executorInfo = v1::createExecutorInfo(
+      v1::DEFAULT_EXECUTOR_ID,
+      None(),
+      "cpus:0.1;mem:32;disk:32",
+      v1::ExecutorInfo::DEFAULT,
+      frameworkId);
+
+  AWAIT_READY(offers);
+  ASSERT_FALSE(offers->offers().empty());
+
+  const v1::Offer& offer = offers->offers(0);
+  const v1::AgentID& agentId = offer.agent_id();
+
+  v1::TaskInfo taskInfo = v1::createTask(
+      agentId,
+      v1::Resources::parse("cpus:0.1;mem:32;disk:32").get(),
+      "sleep 1000");
+
+  v1::Offer::Operation launchGroup =
+    v1::LAUNCH_GROUP(executorInfo, v1::createTaskGroupInfo({taskInfo}));
+
+  Future<v1::scheduler::Event::Update> startingUpdate;
+  Future<v1::scheduler::Event::Update> runningUpdate;
+  EXPECT_CALL(*scheduler, update(_, _))
+    .WillOnce(DoAll(
+        FutureArg<1>(&startingUpdate),
+        v1::scheduler::SendAcknowledge(frameworkId, agentId)))
+    .WillOnce(DoAll(
+        FutureArg<1>(&runningUpdate),
+        v1::scheduler::SendAcknowledge(frameworkId, agentId)))
+    .WillRepeatedly(Return()); // Ignore subsequent status updates.
+
+  mesos.send(v1::createCallAccept(frameworkId, offer, {launchGroup}));
+
+  AWAIT_READY(startingUpdate);
+  ASSERT_EQ(v1::TASK_STARTING, startingUpdate->status().state());
+  ASSERT_EQ(taskInfo.task_id(), startingUpdate->status().task_id());
+
+  AWAIT_READY(runningUpdate);
+  ASSERT_EQ(v1::TASK_RUNNING, runningUpdate->status().state());
+  EXPECT_EQ(taskInfo.task_id(), runningUpdate->status().task_id());
+  EXPECT_TRUE(runningUpdate->status().has_timestamp());
+  ASSERT_TRUE(runningUpdate->status().has_container_status());
+
+  slave.get()->terminate();
+  slave->reset();
+
+  Future<Nothing> _recover = FUTURE_DISPATCH(_, &Slave::_recover);
+
+  slave = this->StartSlave(detector.get(), flags);
+  ASSERT_SOME(slave);
+
+  AWAIT_READY(_recover);
+}
+#endif // __linux__
+
+
 // This is a regression test for MESOS-7926. It verifies that if
 // the default executor process is killed, the future of the nested
 // container destroy will be discarded and that discard will


[5/5] mesos git commit: Added MESOS-8416 to 1.5.1 CHANGELOG.

Posted by gi...@apache.org.
Added MESOS-8416 to 1.5.1 CHANGELOG.


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f62c22a9
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f62c22a9
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f62c22a9

Branch: refs/heads/master
Commit: f62c22a9907c069d5553d703126c57762c83838d
Parents: bd447bb
Author: Gilbert Song <so...@gmail.com>
Authored: Tue Apr 17 10:46:29 2018 -0700
Committer: Gilbert Song <so...@gmail.com>
Committed: Tue Apr 17 10:49:45 2018 -0700

----------------------------------------------------------------------
 CHANGELOG | 1 +
 1 file changed, 1 insertion(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/f62c22a9/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index c02d56d..4a2185e 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -15,6 +15,7 @@ Release Notes - Mesos - Version 1.5.1 (WIP)
   * [MESOS-7742] - Race conditions in IOSwitchboard: listening on unix socket and premature closing of the connection.
   * [MESOS-8125] - Agent should properly handle recovering an executor when its pid is reused.
   * [MESOS-8411] - Killing a queued task can lead to the command executor never terminating.
+  * [MESOS-8416] - CHECK failure if trying to recover nested containers but the framework checkpointing is not enabled.
   * [MESOS-8468] - `LAUNCH_GROUP` failure tears down the default executor.
   * [MESOS-8488] - Docker bug can cause unkillable tasks.
   * [MESOS-8510] - URI disk profile adaptor does not consider plugin type for a profile.


[2/5] mesos git commit: Added unit test for recovering nested container without slave state.

Posted by gi...@apache.org.
Added unit test for recovering nested container without slave state.

Review: https://reviews.apache.org/r/66540


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/eeaf5052
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/eeaf5052
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/eeaf5052

Branch: refs/heads/master
Commit: eeaf5052f02701833aa7662960d6a24dff7f48ab
Parents: f80b0d0
Author: Gilbert Song <so...@gmail.com>
Authored: Tue Apr 10 12:22:31 2018 -0700
Committer: Gilbert Song <so...@gmail.com>
Committed: Tue Apr 17 10:49:30 2018 -0700

----------------------------------------------------------------------
 .../nested_mesos_containerizer_tests.cpp        | 107 +++++++++++++++++++
 1 file changed, 107 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/eeaf5052/src/tests/containerizer/nested_mesos_containerizer_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/nested_mesos_containerizer_tests.cpp b/src/tests/containerizer/nested_mesos_containerizer_tests.cpp
index 796bc40..661be81 100644
--- a/src/tests/containerizer/nested_mesos_containerizer_tests.cpp
+++ b/src/tests/containerizer/nested_mesos_containerizer_tests.cpp
@@ -1706,6 +1706,113 @@ TEST_F(NestedMesosContainerizerTest, ROOT_CGROUPS_RecoverNested)
 }
 
 
+// This test verifies that the agent can recover when the agent
+// metadata is empty but the container runtime dir has not been
+// cleaned up. This is a regression test for MESOS-8416.
+TEST_F(NestedMesosContainerizerTest,
+       ROOT_CGROUPS_RecoverNestedWithoutSlaveState)
+{
+  slave::Flags flags = CreateSlaveFlags();
+  flags.launcher = "linux";
+  flags.isolation = "cgroups/cpu,filesystem/linux,namespaces/pid";
+
+  Fetcher fetcher(flags);
+
+  Try<MesosContainerizer*> create = MesosContainerizer::create(
+      flags,
+      false,
+      &fetcher);
+
+  ASSERT_SOME(create);
+
+  Owned<MesosContainerizer> containerizer(create.get());
+
+  SlaveState state;
+  state.id = SlaveID();
+
+  AWAIT_READY(containerizer->recover(state));
+
+  ContainerID containerId;
+  containerId.set_value(id::UUID::random().toString());
+
+  ExecutorInfo executor = createExecutorInfo(
+      "executor",
+      "sleep 1000",
+      "cpus:1");
+
+  Try<string> directory = environment->mkdtemp();
+  ASSERT_SOME(directory);
+
+  Future<Containerizer::LaunchResult> launch = containerizer->launch(
+      containerId,
+      createContainerConfig(None(), executor, directory.get()),
+      map<string, string>(),
+      slave::paths::getForkedPidPath(
+          slave::paths::getMetaRootDir(flags.work_dir),
+          state.id,
+          executor.framework_id(),
+          executor.executor_id(),
+          containerId));
+
+  AWAIT_ASSERT_EQ(Containerizer::LaunchResult::SUCCESS, launch);
+
+  Future<ContainerStatus> status = containerizer->status(containerId);
+  AWAIT_READY(status);
+  ASSERT_TRUE(status->has_executor_pid());
+
+  // Now launch nested container.
+  ContainerID nestedContainerId;
+  nestedContainerId.mutable_parent()->CopyFrom(containerId);
+  nestedContainerId.set_value(id::UUID::random().toString());
+
+  launch = containerizer->launch(
+      nestedContainerId,
+      createContainerConfig(createCommandInfo("sleep 1000")),
+      map<string, string>(),
+      None());
+
+  AWAIT_ASSERT_EQ(Containerizer::LaunchResult::SUCCESS, launch);
+
+  status = containerizer->status(nestedContainerId);
+  AWAIT_READY(status);
+  ASSERT_TRUE(status->has_executor_pid());
+
+  // Force a delete on the containerizer before we create the new one.
+  containerizer.reset();
+
+  create = MesosContainerizer::create(
+      flags,
+      false,
+      &fetcher);
+
+  ASSERT_SOME(create);
+
+  containerizer.reset(create.get());
+
+  // Pass an empty slave state to simulate that the agent metadata
+  // is removed.
+  AWAIT_READY(containerizer->recover(state));
+
+  Future<Option<ContainerTermination>> wait = containerizer->wait(containerId);
+  Future<Option<ContainerTermination>> nestedWait = containerizer->wait(
+      nestedContainerId);
+
+  AWAIT_READY(nestedWait);
+  ASSERT_SOME(nestedWait.get());
+
+  // We expect a wait status of SIGKILL on the nested container,
+  // since the kernel will destroy these containers via a
+  // SIGKILL.
+  ASSERT_TRUE(nestedWait.get()->has_status());
+  EXPECT_WTERMSIG_EQ(SIGKILL, nestedWait.get()->status());
+
+  AWAIT_READY(wait);
+  ASSERT_SOME(wait.get());
+  ASSERT_TRUE(wait.get()->has_status());
+  EXPECT_WTERMSIG_EQ(SIGKILL, wait.get()->status());
+}
+
+
 TEST_F(NestedMesosContainerizerTest, ROOT_CGROUPS_RecoverNestedWithoutConfig)
 {
   slave::Flags flags = CreateSlaveFlags();


[3/5] mesos git commit: Fixed the agent recovery crash if metadata is missing.

Posted by gi...@apache.org.
Fixed the agent recovery crash if metadata is missing.

This case was missed when handling orphan container cleanup.
When the agent metadata does not exist but the container pid is
checkpointed under the container runtime dir, the container
should be regarded as an orphan and should be cleaned up.

Review: https://reviews.apache.org/r/66539


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/f80b0d0b
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/f80b0d0b
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/f80b0d0b

Branch: refs/heads/master
Commit: f80b0d0b863acbb0681e2f8fc063c226686b45a0
Parents: 2bfcbcc
Author: Gilbert Song <so...@gmail.com>
Authored: Tue Apr 10 11:35:03 2018 -0700
Committer: Gilbert Song <so...@gmail.com>
Committed: Tue Apr 17 10:49:30 2018 -0700

----------------------------------------------------------------------
 src/slave/containerizer/mesos/containerizer.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/f80b0d0b/src/slave/containerizer/mesos/containerizer.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/containerizer.cpp b/src/slave/containerizer/mesos/containerizer.cpp
index d1d4c2a..6568126 100644
--- a/src/slave/containerizer/mesos/containerizer.cpp
+++ b/src/slave/containerizer/mesos/containerizer.cpp
@@ -856,10 +856,11 @@ Future<Nothing> MesosContainerizerProcess::recover(
       containerizer::paths::isStandaloneContainer(
           flags.runtime_dir, containerId);
 
+    const ContainerID& rootContainerId =
+      protobuf::getRootContainerId(containerId);
+
     Option<string> directory;
     if (containerId.has_parent()) {
-      const ContainerID& rootContainerId =
-        protobuf::getRootContainerId(containerId);
       CHECK(containers_.contains(rootContainerId));
 
       if (containers_[rootContainerId]->directory.isSome()) {
@@ -906,7 +907,8 @@ Future<Nothing> MesosContainerizerProcess::recover(
     // elsewhere.
     const bool isRecoverableNestedContainer =
       containerId.has_parent() &&
-      (containers_.contains(protobuf::getRootContainerId(containerId))) &&
+      containers_.contains(rootContainerId) &&
+      !orphans.contains(rootContainerId) &&
       pid.isSome() &&
       !containerizer::paths::getContainerForceDestroyOnRecovery(
           flags.runtime_dir, containerId);