You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by gi...@apache.org on 2018/10/08 17:43:03 UTC

[mesos] branch master updated (17c1d7d -> e135b7f)

This is an automated email from the ASF dual-hosted git repository.

gilbert pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git.


    from 17c1d7d  Fused constructors of `MethodNotAllowed` into one.
     new 200a532  Fixed the nested container launch failure on the agent upgrade case.
     new 36a64c8  Added an unit test for agent recovery with new cgroup subsystems.
     new e135b7f  Added MESOS-9295 to 1.7.1 CHANGELOG.

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGELOG                                          |   1 +
 .../mesos/isolators/cgroups/cgroups.cpp            |  34 +++--
 src/tests/containerizer/cgroups_isolator_tests.cpp | 147 +++++++++++++++++++++
 3 files changed, 174 insertions(+), 8 deletions(-)


[mesos] 02/03: Added an unit test for agent recovery with new cgroup subsystems.

Posted by gi...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

gilbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 36a64c869cb04704047b86d3f8d11f1399aa8a8c
Author: Gilbert Song <so...@gmail.com>
AuthorDate: Fri Oct 5 12:19:01 2018 -0700

    Added an unit test for agent recovery with new cgroup subsystems.
    
    Review: https://reviews.apache.org/r/68941
---
 src/tests/containerizer/cgroups_isolator_tests.cpp | 147 +++++++++++++++++++++
 1 file changed, 147 insertions(+)

diff --git a/src/tests/containerizer/cgroups_isolator_tests.cpp b/src/tests/containerizer/cgroups_isolator_tests.cpp
index 368ab93..fccab20 100644
--- a/src/tests/containerizer/cgroups_isolator_tests.cpp
+++ b/src/tests/containerizer/cgroups_isolator_tests.cpp
@@ -1904,6 +1904,153 @@ TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_AutoLoadSubsystems)
 }
 
 
+// This test verifies that after the agent recovery/upgrade, nested
+// containers can still be launched under old containers which
+// were launched before the agent restarted, even if new cgroup
+// subsystems are added to the agent cgroup isolation.
+TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_AgentRecoveryWithNewCgroupSubsystems)
+{
+  // Disable AuthN on the agent.
+  slave::Flags flags = CreateSlaveFlags();
+  flags.isolation = "filesystem/linux,docker/runtime,cgroups/mem";
+  flags.image_providers = "docker";
+  flags.authenticate_http_readwrite = false;
+
+  Try<Owned<cluster::Master>> master = StartMaster();
+  ASSERT_SOME(master);
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+
+  // Start the slave with a static process ID. This allows the executor to
+  // reconnect with the slave upon a process restart.
+  const string id("agent");
+
+  Try<Owned<cluster::Slave>> slave = StartSlave(detector.get(), id, flags);
+  ASSERT_SOME(slave);
+
+  auto scheduler = std::make_shared<v1::MockHTTPScheduler>();
+
+  v1::FrameworkInfo frameworkInfo = v1::DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.set_checkpoint(true);
+
+  EXPECT_CALL(*scheduler, connected(_))
+    .WillOnce(v1::scheduler::SendSubscribe(frameworkInfo));
+
+  Future<Event::Subscribed> subscribed;
+  EXPECT_CALL(*scheduler, subscribed(_, _))
+    .WillOnce(FutureArg<1>(&subscribed));
+
+  Future<Event::Offers> offers1;
+  EXPECT_CALL(*scheduler, offers(_, _))
+    .WillOnce(FutureArg<1>(&offers1))
+    .WillRepeatedly(Return());
+
+  EXPECT_CALL(*scheduler, heartbeat(_))
+    .WillRepeatedly(Return()); // Ignore heartbeats.
+
+  v1::scheduler::TestMesos mesos(
+      master.get()->pid, ContentType::PROTOBUF, scheduler);
+
+  AWAIT_READY(subscribed);
+  v1::FrameworkID frameworkId(subscribed->framework_id());
+
+  v1::ExecutorInfo executorInfo = v1::createExecutorInfo(
+      "test_default_executor",
+      None(),
+      "cpus:0.1;mem:32;disk:32",
+      v1::ExecutorInfo::DEFAULT);
+
+  // Update `executorInfo` with the subscribed `frameworkId`.
+  executorInfo.mutable_framework_id()->CopyFrom(frameworkId);
+
+  AWAIT_READY(offers1);
+  ASSERT_FALSE(offers1->offers().empty());
+
+  const v1::Offer& offer1 = offers1->offers(0);
+
+  v1::TaskInfo taskInfo1 = v1::createTask(
+      offer1.agent_id(),
+      v1::Resources::parse("cpus:0.1;mem:32;disk:32").get(),
+      "sleep 1000");
+
+  Future<v1::scheduler::Event::Update> startingUpdate1;
+  Future<v1::scheduler::Event::Update> runningUpdate1;
+  EXPECT_CALL(*scheduler, update(_, _))
+    .WillOnce(DoAll(
+        FutureArg<1>(&startingUpdate1),
+        v1::scheduler::SendAcknowledge(frameworkId, offer1.agent_id())))
+    .WillOnce(DoAll(
+        FutureArg<1>(&runningUpdate1),
+        v1::scheduler::SendAcknowledge(frameworkId, offer1.agent_id())))
+    .WillRepeatedly(Return());
+
+  mesos.send(
+      v1::createCallAccept(
+          frameworkId,
+          offer1,
+          {v1::LAUNCH_GROUP(
+              executorInfo, v1::createTaskGroupInfo({taskInfo1}))}));
+
+  AWAIT_READY(startingUpdate1);
+  ASSERT_EQ(v1::TASK_STARTING, startingUpdate1->status().state());
+  ASSERT_EQ(taskInfo1.task_id(), startingUpdate1->status().task_id());
+
+  AWAIT_READY(runningUpdate1);
+  ASSERT_EQ(v1::TASK_RUNNING, runningUpdate1->status().state());
+  ASSERT_EQ(taskInfo1.task_id(), runningUpdate1->status().task_id());
+
+  slave.get()->terminate();
+  slave->reset();
+
+  Future<Nothing> __recover = FUTURE_DISPATCH(_, &Slave::__recover);
+
+  // Update the cgroup isolation to introduce new subsystems.
+  flags.isolation = "filesystem/linux,docker/runtime,cgroups/all";
+  slave = this->StartSlave(detector.get(), id, flags);
+  ASSERT_SOME(slave);
+
+  AWAIT_READY(__recover);
+
+  Future<Event::Offers> offers2;
+  EXPECT_CALL(*scheduler, offers(_, _))
+    .WillOnce(FutureArg<1>(&offers2))
+    .WillRepeatedly(Return());
+
+  AWAIT_READY(offers2);
+  ASSERT_FALSE(offers2->offers().empty());
+
+  const v1::Offer& offer2 = offers2->offers(0);
+
+  v1::TaskInfo taskInfo2 = v1::createTask(
+      offer2.agent_id(),
+      v1::Resources::parse("cpus:0.1;mem:32;disk:32").get(),
+      "sleep 1000");
+
+  Future<v1::scheduler::Event::Update> startingUpdate2;
+  Future<v1::scheduler::Event::Update> runningUpdate2;
+  EXPECT_CALL(*scheduler, update(_, _))
+    .WillOnce(DoAll(
+        FutureArg<1>(&startingUpdate2),
+        v1::scheduler::SendAcknowledge(frameworkId, offer2.agent_id())))
+    .WillOnce(FutureArg<1>(&runningUpdate2));
+
+  mesos.send(
+      v1::createCallAccept(
+          frameworkId,
+          offer2,
+          {v1::LAUNCH_GROUP(
+              executorInfo, v1::createTaskGroupInfo({taskInfo2}))}));
+
+  AWAIT_READY(startingUpdate2);
+  ASSERT_EQ(v1::TASK_STARTING, startingUpdate2->status().state());
+  ASSERT_EQ(taskInfo2.task_id(), startingUpdate2->status().task_id());
+
+  AWAIT_READY(runningUpdate2);
+  ASSERT_EQ(v1::TASK_RUNNING, runningUpdate2->status().state());
+  ASSERT_EQ(taskInfo2.task_id(), runningUpdate2->status().task_id());
+}
+
+
 // This test verifies the container-specific cgroups are correctly mounted
 // inside the nested container.
 TEST_F(CgroupsIsolatorTest, ROOT_CGROUPS_NestedContainerSpecificCgroupsMount)


[mesos] 01/03: Fixed the nested container launch failure on the agent upgrade case.

Posted by gi...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

gilbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 200a532d33b647cc26d9566bbc1765bc039e699d
Author: Gilbert Song <so...@gmail.com>
AuthorDate: Thu Oct 4 16:54:24 2018 -0700

    Fixed the nested container launch failure on the agent upgrade case.
    
    If new cgroup subsystems are added after the agent upgrade
    or recovery, a new nested container launched under an old container
    that was launched before the recovery would fail, because it cannot
    assign its pid to the non-existent cgroup hierarchy. We should skip
    those new cgroup subsystems for nested containers under old containers.
    
    Review: https://reviews.apache.org/r/68929
---
 .../mesos/isolators/cgroups/cgroups.cpp            | 34 +++++++++++++++++-----
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
index 11dfbab..fbb1b43 100644
--- a/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
+++ b/src/slave/containerizer/mesos/isolators/cgroups/cgroups.cpp
@@ -340,10 +340,13 @@ Future<Nothing> CgroupsIsolatorProcess::___recover(
   // TODO(haosdent): Use foreachkey once MESOS-5037 is resolved.
   foreach (const string& hierarchy, subsystems.keys()) {
     if (!cgroups::exists(hierarchy, cgroup)) {
-      // This may occur if the executor has exited and the isolator
-      // has destroyed the cgroup but the agent dies before noticing
-      // this. This will be detected when the containerizer tries to
-      // monitor the executor's pid.
+      // This may occur in two cases:
+      // 1. If the executor has exited and the isolator has destroyed
+      //    the cgroup but the agent dies before noticing this. This
+      //    will be detected when the containerizer tries to monitor
+      //    the executor's pid.
+      // 2. After the agent recovery/upgrade, new cgroup subsystems
+      //    are added to the agent cgroup isolation configuration.
       LOG(WARNING) << "Couldn't find the cgroup '" << cgroup << "' "
                    << "in hierarchy '" << hierarchy << "' "
                    << "for container " << containerId;
@@ -677,18 +680,33 @@ Future<Nothing> CgroupsIsolatorProcess::isolate(
     return Failure("Failed to isolate the container: Unknown root container");
   }
 
+  const string& cgroup = infos[rootContainerId]->cgroup;
+
   // TODO(haosdent): Use foreachkey once MESOS-5037 is resolved.
   foreach (const string& hierarchy, subsystems.keys()) {
+    // If new cgroup subsystems are added after the agent
+    // upgrade, the newly added cgroup subsystems do not
+    // exist in the old container's cgroup hierarchy. So skip
+    // assigning the pid to this cgroup subsystem.
+    if (containerId.has_parent() && !cgroups::exists(hierarchy, cgroup)) {
+      LOG(INFO) << "Skipping assigning pid " << stringify(pid)
+                << " to cgroup at '" << path::join(hierarchy, cgroup)
+                << "' for container " << containerId
+                << " because its parent container " << containerId.parent()
+                << " does not have this cgroup hierarchy";
+      continue;
+    }
+
     Try<Nothing> assign = cgroups::assign(
         hierarchy,
-        infos[rootContainerId]->cgroup,
+        cgroup,
         pid);
 
     if (assign.isError()) {
       string message =
-        "Failed to assign pid " + stringify(pid) + " to cgroup at "
-        "'" + path::join(hierarchy, infos[rootContainerId]->cgroup) + "'"
-        ": " + assign.error();
+        "Failed to assign container " + stringify(containerId) +
+        " pid " + stringify(pid) + " to cgroup at '" +
+        path::join(hierarchy, cgroup) + "': " + assign.error();
 
       LOG(ERROR) << message;
 


[mesos] 03/03: Added MESOS-9295 to 1.7.1 CHANGELOG.

Posted by gi...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

gilbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit e135b7f2175c01fd67b71b22cdc325ac37853a9d
Author: Gilbert Song <so...@gmail.com>
AuthorDate: Mon Oct 8 10:34:27 2018 -0700

    Added MESOS-9295 to 1.7.1 CHANGELOG.
---
 CHANGELOG | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG b/CHANGELOG
index 6a47201..8756474 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -15,6 +15,7 @@ Release Notes - Mesos - Version 1.7.1 (WIP)
   * [MESOS-9274] - v1 JAVA scheduler library can drop TEARDOWN upon destruction.
   * [MESOS-9279] - Docker Containerizer 'usage' call might be expensive if mount table is big.
   * [MESOS-9283] - Docker containerizer actor can get backlogged with large number of containers.
+  * [MESOS-9295] - Nested container launch could fail if the agent upgrade with new cgroup subsystems.
 
 ** Improvement:
   * [MESOS-6765] - Make the Resources wrapper "copy-on-write" to improve performance.