You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by mz...@apache.org on 2019/09/23 20:43:10 UTC

[mesos] 03/04: Simplified recover resources when removing frameworks or agents.

This is an automated email from the ASF dual-hosted git repository.

mzhu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 5849c4af5fc702d9bb366da44f58ae93a2712f3a
Author: Meng Zhu <mz...@mesosphere.io>
AuthorDate: Thu Sep 12 11:46:23 2019 -0700

    Simplified recover resources when removing frameworks or agents.
    
    Review: https://reviews.apache.org/r/71476
---
 src/master/allocator/mesos/hierarchical.cpp | 85 ++++++++++++++++-------------
 src/master/allocator/mesos/hierarchical.hpp |  8 +++
 src/master/master.cpp                       | 12 ----
 3 files changed, 55 insertions(+), 50 deletions(-)

diff --git a/src/master/allocator/mesos/hierarchical.cpp b/src/master/allocator/mesos/hierarchical.cpp
index 62750cc..4728154 100644
--- a/src/master/allocator/mesos/hierarchical.cpp
+++ b/src/master/allocator/mesos/hierarchical.cpp
@@ -675,6 +675,15 @@ void HierarchicalAllocatorProcess::removeFramework(
 {
   CHECK(initialized);
 
+  // Free up resources on agents if any.
+  foreachvalue (Slave& slave, slaves) {
+    slave.increaseAvailable(
+        frameworkId,
+        slave.getOfferedOrAllocated().get(frameworkId).getOrElse(Resources()));
+  }
+
+  // Update tracking in the role tree and sorters.
+
   Framework& framework = *CHECK_NOTNONE(getFramework(frameworkId));
 
   foreach (const string& role, framework.roles) {
@@ -926,11 +935,13 @@ void HierarchicalAllocatorProcess::removeSlave(
   {
     const Slave& slave = *CHECK_NOTNONE(getSlave(slaveId));
 
-    // TODO(bmahler): Per MESOS-621, this should remove the allocations
-    // that any frameworks have on this slave. Otherwise the caller may
-    // "leak" allocated resources accidentally if they forget to recover
-    // all the resources. Fixing this would require more information
-    // than what we currently track in the allocator.
+    // Untrack resources in roleTree and sorter.
+    foreachpair (
+        const FrameworkID& frameworkId,
+        const Resources& resources,
+        slave.getOfferedOrAllocated()) {
+      untrackAllocatedResources(slaveId, frameworkId, resources);
+    }
 
     roleSorter->remove(slaveId, slave.getTotal());
 
@@ -1459,6 +1470,22 @@ void HierarchicalAllocatorProcess::recoverResources(
     return;
   }
 
+  Option<Framework*> framework = getFramework(frameworkId);
+  Option<Slave*> slave = getSlave(slaveId);
+
+  // No work to do if either the framework or the agent no longer exists.
+  //
+  // The framework may not exist if we dispatched Master::offer before we
+  // received MesosAllocatorProcess::removeFramework or
+  // MesosAllocatorProcess::deactivateFramework, in which case we will
+  // have already recovered all of its resources).
+  //
+  // The agent may not exist if we dispatched Master::offer before we
+  // received `removeSlave`.
+  if (framework.isNone() || slave.isNone()) {
+    return;
+  }
+
   // For now, we require that resources are recovered within a single
   // allocation role (since filtering in the same manner across roles
   // seems undesirable).
@@ -1472,41 +1499,26 @@ void HierarchicalAllocatorProcess::recoverResources(
 
   string role = allocations.begin()->first;
 
-  // Updated resources allocated to framework (if framework still
-  // exists, which it might not in the event that we dispatched
-  // Master::offer before we received
-  // MesosAllocatorProcess::removeFramework or
-  // MesosAllocatorProcess::deactivateFramework, in which case we will
-  // have already recovered all of its resources).
-  Option<Framework*> framework = getFramework(frameworkId);
+  // Update resources on the agent.
 
-  if (framework.isSome()) {
-    Sorter* frameworkSorter = CHECK_NOTNONE(getFrameworkSorter(role));
+  CHECK((*slave)->getTotalOfferedOrAllocated().contains(resources))
+    << "agent " << slaveId << " resources "
+    << (*slave)->getTotalOfferedOrAllocated() << " do not contain "
+    << resources;
 
-    if (frameworkSorter->contains(frameworkId.value())) {
-      untrackAllocatedResources(slaveId, frameworkId, resources);
-    }
-  }
+  (*slave)->increaseAvailable(frameworkId, resources);
 
-  // Update resources allocated on slave (if slave still exists,
-  // which it might not in the event that we dispatched Master::offer
-  // before we received Allocator::removeSlave).
-  Option<Slave*> slave = getSlave(slaveId);
+  VLOG(1) << "Recovered " << resources << " (total: " << (*slave)->getTotal()
+          << ", offered or allocated: "
+          << (*slave)->getTotalOfferedOrAllocated() << ")"
+          << " on agent " << slaveId << " from framework " << frameworkId;
 
-  if (slave.isSome()) {
-    CHECK((*slave)->getTotalOfferedOrAllocated().contains(resources))
-      << "agent " << slaveId << " resources "
-      << (*slave)->getTotalOfferedOrAllocated() << " do not contain "
-      << resources;
+  // Update role tree and sorter.
 
-    (*slave)->increaseAvailable(frameworkId, resources);
+  Sorter* frameworkSorter = CHECK_NOTNONE(getFrameworkSorter(role));
 
-    VLOG(1) << "Recovered " << resources
-            << " (total: " << (*slave)->getTotal()
-            << ", offered or allocated: "
-            << (*slave)->getTotalOfferedOrAllocated() << ")"
-            << " on agent " << slaveId
-            << " from framework " << frameworkId;
+  if (frameworkSorter->contains(frameworkId.value())) {
+    untrackAllocatedResources(slaveId, frameworkId, resources);
   }
 
   // No need to install the filter if 'filters' is none.
@@ -1514,10 +1526,7 @@ void HierarchicalAllocatorProcess::recoverResources(
     return;
   }
 
-  // No need to install the filter if slave/framework does not exist.
-  if (framework.isNone() || slave.isNone()) {
-    return;
-  }
+  // Update filters.
 
   // Create a refused resources filter.
   Try<Duration> timeout = Duration::create(Filters().refuse_seconds());
diff --git a/src/master/allocator/mesos/hierarchical.hpp b/src/master/allocator/mesos/hierarchical.hpp
index 36aa2fc..d42124f 100644
--- a/src/master/allocator/mesos/hierarchical.hpp
+++ b/src/master/allocator/mesos/hierarchical.hpp
@@ -321,8 +321,12 @@ public:
       const FrameworkID& frameworkId, const Resources& offeredOrAllocated_)
   {
     // Increasing available is to subtract offered or allocated.
+    if (offeredOrAllocated_.empty()) {
+      return;
+    }
 
     Resources& resources = offeredOrAllocated.at(frameworkId);
+    CHECK_CONTAINS(resources, offeredOrAllocated_);
     resources -= offeredOrAllocated_;
     if (resources.empty()) {
       offeredOrAllocated.erase(frameworkId);
@@ -336,6 +340,10 @@ public:
   void decreaseAvailable(
       const FrameworkID& frameworkId, const Resources& offeredOrAllocated_)
   {
+    if (offeredOrAllocated_.empty()) {
+      return;
+    }
+
     // Decreasing available is to add offered or allocated.
 
     offeredOrAllocated[frameworkId] += offeredOrAllocated_;
diff --git a/src/master/master.cpp b/src/master/master.cpp
index e5bc493..65994aa 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -11640,12 +11640,6 @@ void Master::_removeSlave(
 
   // We want to remove the slave first, to avoid the allocator
   // re-allocating the recovered resources.
-  //
-  // NOTE: Removing the slave is not sufficient for recovering the
-  // resources in the allocator, because the "Sorters" are updated
-  // only within recoverResources() (see MESOS-621). The calls to
-  // recoverResources() below are therefore required, even though
-  // the slave is already removed.
   allocator->removeSlave(slave->id);
 
   // Transition the tasks to lost and remove them.
@@ -11774,12 +11768,6 @@ void Master::__removeSlave(
 {
   // We want to remove the slave first, to avoid the allocator
   // re-allocating the recovered resources.
-  //
-  // NOTE: Removing the slave is not sufficient for recovering the
-  // resources in the allocator, because the "Sorters" are updated
-  // only within recoverResources() (see MESOS-621). The calls to
-  // recoverResources() below are therefore required, even though
-  // the slave is already removed.
   allocator->removeSlave(slave->id);
 
   // Transition tasks to TASK_UNREACHABLE/TASK_GONE_BY_OPERATOR/TASK_LOST