Posted to commits@mesos.apache.org by mz...@apache.org on 2019/09/23 20:43:07 UTC

[mesos] branch master updated (6ded10f -> bdb3a69)

This is an automated email from the ASF dual-hosted git repository.

mzhu pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git.


    from 6ded10f  Adjusted CSI example framework for recent changes.
     new 82f2dfd  Fixed a bug where sorter may leak clients.
     new 81e10fc  Refactored framework role tracking logic in the allocator.
     new 5849c4a  Simplified recover resources when removing frameworks or agents.
     new bdb3a69  Added a test to ensure resources are recovered during agent removal.

The 4 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/master/allocator/mesos/hierarchical.cpp        | 149 +++++++++++----------
 src/master/allocator/mesos/hierarchical.hpp        |  35 ++++-
 src/master/allocator/mesos/sorter/drf/sorter.hpp   |   8 +-
 .../allocator/mesos/sorter/random/sorter.hpp       |   8 +-
 src/master/master.cpp                              |  12 --
 src/tests/hierarchical_allocator_tests.cpp         |  74 ++++++++++
 src/tests/sorter_tests.cpp                         |  16 +++
 7 files changed, 213 insertions(+), 89 deletions(-)


[mesos] 04/04: Added a test to ensure resources are recovered during agent removal.

Posted by mz...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

mzhu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit bdb3a6956f11ebd9ee40de947a079a169799298e
Author: Meng Zhu <mz...@mesosphere.io>
AuthorDate: Thu Sep 12 12:38:22 2019 -0700

    Added a test to ensure resources are recovered during agent removal.
    
    Review: https://reviews.apache.org/r/71479
---
 src/tests/hierarchical_allocator_tests.cpp | 74 ++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/src/tests/hierarchical_allocator_tests.cpp b/src/tests/hierarchical_allocator_tests.cpp
index 5f2e2b2..a13eb01 100644
--- a/src/tests/hierarchical_allocator_tests.cpp
+++ b/src/tests/hierarchical_allocator_tests.cpp
@@ -3066,6 +3066,80 @@ TEST_F(HierarchicalAllocatorTest, UpdateSlaveCapabilities)
 }
 
 
+// This is a regression test for MESOS-621. It ensures that when removing
+// an agent, its resources are fully recovered.
+TEST_F(HierarchicalAllocatorTest, RemoveSlaveRecoverResources)
+{
+  // Per MESOS-621, removing an agent used to be a two-step process:
+  // resources in the `Slave` struct were recovered immediately, but
+  // state in the role tree and the sorters required an additional
+  // `recoverResources()` call.
+  // We verify the fix by:
+  //  - Allocating a larger agent1 to framework1.
+  //  - Allocating a smaller agent2 to framework2.
+  //  - Removing agent1 (without calling `recoverResources()`).
+  //  - Adding agent3.
+  //
+  // Agent3 should be allocated to framework1 since it has no resources
+  // (verifying that the sorter state is correctly updated after
+  // agent1 is removed).
+
+  Clock::pause();
+
+  initialize();
+
+  SlaveInfo slave1 = createSlaveInfo("cpus:2;mem:200");
+  allocator->addSlave(
+      slave1.id(),
+      slave1,
+      AGENT_CAPABILITIES(),
+      None(),
+      slave1.resources(),
+      {});
+
+  FrameworkInfo framework1 = createFrameworkInfo({"role"});
+  allocator->addFramework(framework1.id(), framework1, {}, true, {});
+
+  Allocation expected = Allocation(
+      framework1.id(), {{"role", {{slave1.id(), slave1.resources()}}}});
+
+  AWAIT_EXPECT_EQ(expected, allocations.get());
+
+  FrameworkInfo framework2 = createFrameworkInfo({"role"});
+  allocator->addFramework(framework2.id(), framework2, {}, true, {});
+
+  SlaveInfo slave2 = createSlaveInfo("cpus:1;mem:100");
+  allocator->addSlave(
+      slave2.id(),
+      slave2,
+      AGENT_CAPABILITIES(),
+      None(),
+      slave2.resources(),
+      {});
+
+  expected = Allocation(
+      framework2.id(), {{"role", {{slave2.id(), slave2.resources()}}}});
+
+  AWAIT_EXPECT_EQ(expected, allocations.get());
+
+  allocator->removeSlave(slave1.id());
+
+  SlaveInfo slave3 = createSlaveInfo("cpus:1;mem:100");
+  allocator->addSlave(
+      slave3.id(),
+      slave3,
+      AGENT_CAPABILITIES(),
+      None(),
+      slave3.resources(),
+      {});
+
+  expected = Allocation(
+      framework1.id(), {{"role", {{slave3.id(), slave3.resources()}}}});
+
+  AWAIT_EXPECT_EQ(expected, allocations.get());
+}
+
+
 // This is a white-box test to ensure that MESOS-9554 is fixed.
 // It ensures that if a framework is not capable of receiving
 // any resources on an agent, we still proceed to try allocating


[mesos] 03/04: Simplified recover resources when removing frameworks or agents.

Posted by mz...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

mzhu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 5849c4af5fc702d9bb366da44f58ae93a2712f3a
Author: Meng Zhu <mz...@mesosphere.io>
AuthorDate: Thu Sep 12 11:46:23 2019 -0700

    Simplified recover resources when removing frameworks or agents.
    
    Review: https://reviews.apache.org/r/71476
---
 src/master/allocator/mesos/hierarchical.cpp | 85 ++++++++++++++++-------------
 src/master/allocator/mesos/hierarchical.hpp |  8 +++
 src/master/master.cpp                       | 12 ----
 3 files changed, 55 insertions(+), 50 deletions(-)

diff --git a/src/master/allocator/mesos/hierarchical.cpp b/src/master/allocator/mesos/hierarchical.cpp
index 62750cc..4728154 100644
--- a/src/master/allocator/mesos/hierarchical.cpp
+++ b/src/master/allocator/mesos/hierarchical.cpp
@@ -675,6 +675,15 @@ void HierarchicalAllocatorProcess::removeFramework(
 {
   CHECK(initialized);
 
+  // Free up resources on agents if any.
+  foreachvalue (Slave& slave, slaves) {
+    slave.increaseAvailable(
+        frameworkId,
+        slave.getOfferedOrAllocated().get(frameworkId).getOrElse(Resources()));
+  }
+
+  // Update tracking in the role tree and sorters.
+
   Framework& framework = *CHECK_NOTNONE(getFramework(frameworkId));
 
   foreach (const string& role, framework.roles) {
@@ -926,11 +935,13 @@ void HierarchicalAllocatorProcess::removeSlave(
   {
     const Slave& slave = *CHECK_NOTNONE(getSlave(slaveId));
 
-    // TODO(bmahler): Per MESOS-621, this should remove the allocations
-    // that any frameworks have on this slave. Otherwise the caller may
-    // "leak" allocated resources accidentally if they forget to recover
-    // all the resources. Fixing this would require more information
-    // than what we currently track in the allocator.
+    // Untrack resources in roleTree and sorter.
+    foreachpair (
+        const FrameworkID& frameworkId,
+        const Resources& resources,
+        slave.getOfferedOrAllocated()) {
+      untrackAllocatedResources(slaveId, frameworkId, resources);
+    }
 
     roleSorter->remove(slaveId, slave.getTotal());
 
@@ -1459,6 +1470,22 @@ void HierarchicalAllocatorProcess::recoverResources(
     return;
   }
 
+  Option<Framework*> framework = getFramework(frameworkId);
+  Option<Slave*> slave = getSlave(slaveId);
+
+  // No work to do if either the framework or the agent no longer exists.
+  //
+  // The framework may not exist if we dispatched Master::offer before we
+  // received MesosAllocatorProcess::removeFramework or
+  // MesosAllocatorProcess::deactivateFramework, in which case we will
+  // have already recovered all of its resources.
+  //
+  // The agent may not exist if we dispatched Master::offer before we
+  // received `removeSlave`.
+  if (framework.isNone() || slave.isNone()) {
+    return;
+  }
+
   // For now, we require that resources are recovered within a single
   // allocation role (since filtering in the same manner across roles
   // seems undesirable).
@@ -1472,41 +1499,26 @@ void HierarchicalAllocatorProcess::recoverResources(
 
   string role = allocations.begin()->first;
 
-  // Updated resources allocated to framework (if framework still
-  // exists, which it might not in the event that we dispatched
-  // Master::offer before we received
-  // MesosAllocatorProcess::removeFramework or
-  // MesosAllocatorProcess::deactivateFramework, in which case we will
-  // have already recovered all of its resources).
-  Option<Framework*> framework = getFramework(frameworkId);
+  // Update resources on the agent.
 
-  if (framework.isSome()) {
-    Sorter* frameworkSorter = CHECK_NOTNONE(getFrameworkSorter(role));
+  CHECK((*slave)->getTotalOfferedOrAllocated().contains(resources))
+    << "agent " << slaveId << " resources "
+    << (*slave)->getTotalOfferedOrAllocated() << " do not contain "
+    << resources;
 
-    if (frameworkSorter->contains(frameworkId.value())) {
-      untrackAllocatedResources(slaveId, frameworkId, resources);
-    }
-  }
+  (*slave)->increaseAvailable(frameworkId, resources);
 
-  // Update resources allocated on slave (if slave still exists,
-  // which it might not in the event that we dispatched Master::offer
-  // before we received Allocator::removeSlave).
-  Option<Slave*> slave = getSlave(slaveId);
+  VLOG(1) << "Recovered " << resources << " (total: " << (*slave)->getTotal()
+          << ", offered or allocated: "
+          << (*slave)->getTotalOfferedOrAllocated() << ")"
+          << " on agent " << slaveId << " from framework " << frameworkId;
 
-  if (slave.isSome()) {
-    CHECK((*slave)->getTotalOfferedOrAllocated().contains(resources))
-      << "agent " << slaveId << " resources "
-      << (*slave)->getTotalOfferedOrAllocated() << " do not contain "
-      << resources;
+  // Update role tree and sorter.
 
-    (*slave)->increaseAvailable(frameworkId, resources);
+  Sorter* frameworkSorter = CHECK_NOTNONE(getFrameworkSorter(role));
 
-    VLOG(1) << "Recovered " << resources
-            << " (total: " << (*slave)->getTotal()
-            << ", offered or allocated: "
-            << (*slave)->getTotalOfferedOrAllocated() << ")"
-            << " on agent " << slaveId
-            << " from framework " << frameworkId;
+  if (frameworkSorter->contains(frameworkId.value())) {
+    untrackAllocatedResources(slaveId, frameworkId, resources);
   }
 
   // No need to install the filter if 'filters' is none.
@@ -1514,10 +1526,7 @@ void HierarchicalAllocatorProcess::recoverResources(
     return;
   }
 
-  // No need to install the filter if slave/framework does not exist.
-  if (framework.isNone() || slave.isNone()) {
-    return;
-  }
+  // Update filters.
 
   // Create a refused resources filter.
   Try<Duration> timeout = Duration::create(Filters().refuse_seconds());
diff --git a/src/master/allocator/mesos/hierarchical.hpp b/src/master/allocator/mesos/hierarchical.hpp
index 36aa2fc..d42124f 100644
--- a/src/master/allocator/mesos/hierarchical.hpp
+++ b/src/master/allocator/mesos/hierarchical.hpp
@@ -321,8 +321,12 @@ public:
       const FrameworkID& frameworkId, const Resources& offeredOrAllocated_)
   {
     // Increasing available is to subtract offered or allocated.
+    if (offeredOrAllocated_.empty()) {
+      return;
+    }
 
     Resources& resources = offeredOrAllocated.at(frameworkId);
+    CHECK_CONTAINS(resources, offeredOrAllocated_);
     resources -= offeredOrAllocated_;
     if (resources.empty()) {
       offeredOrAllocated.erase(frameworkId);
@@ -336,6 +340,10 @@ public:
   void decreaseAvailable(
       const FrameworkID& frameworkId, const Resources& offeredOrAllocated_)
   {
+    if (offeredOrAllocated_.empty()) {
+      return;
+    }
+
     // Decreasing available is to add offered or allocated.
 
     offeredOrAllocated[frameworkId] += offeredOrAllocated_;
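
The two helpers above maintain an inverted view: availability is never
stored directly, so increasing it means subtracting from the
offered-or-allocated map, and decreasing it means adding to it. A minimal
standalone sketch of that bookkeeping, using simplified stand-in types
rather than the actual Mesos classes (Resources is modeled as a plain
integer here):

#include <cassert>
#include <map>
#include <string>

using FrameworkID = std::string;
using Resources = int;  // stand-in for mesos::Resources

class Slave
{
public:
  // "Increasing available" subtracts from the offered/allocated map.
  void increaseAvailable(const FrameworkID& id, const Resources& delta)
  {
    if (delta == 0) {
      return;  // empty delta: nothing to do (the new early return)
    }

    Resources& r = offeredOrAllocated.at(id);
    assert(r >= delta);  // mirrors CHECK_CONTAINS
    r -= delta;

    if (r == 0) {
      offeredOrAllocated.erase(id);  // never keep empty entries
    }
  }

  // "Decreasing available" adds to the offered/allocated map.
  void decreaseAvailable(const FrameworkID& id, const Resources& delta)
  {
    if (delta == 0) {
      return;
    }

    offeredOrAllocated[id] += delta;
  }

private:
  std::map<FrameworkID, Resources> offeredOrAllocated;
};

int main()
{
  Slave slave;
  slave.decreaseAvailable("framework-1", 4);  // 4 cpus offered
  slave.increaseAvailable("framework-1", 4);  // all 4 recovered
  // The framework-1 entry is now erased rather than left at zero.
  return 0;
}
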
diff --git a/src/master/master.cpp b/src/master/master.cpp
index e5bc493..65994aa 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -11640,12 +11640,6 @@ void Master::_removeSlave(
 
   // We want to remove the slave first, to avoid the allocator
   // re-allocating the recovered resources.
-  //
-  // NOTE: Removing the slave is not sufficient for recovering the
-  // resources in the allocator, because the "Sorters" are updated
-  // only within recoverResources() (see MESOS-621). The calls to
-  // recoverResources() below are therefore required, even though
-  // the slave is already removed.
   allocator->removeSlave(slave->id);
 
   // Transition the tasks to lost and remove them.
@@ -11774,12 +11768,6 @@ void Master::__removeSlave(
 {
   // We want to remove the slave first, to avoid the allocator
   // re-allocating the recovered resources.
-  //
-  // NOTE: Removing the slave is not sufficient for recovering the
-  // resources in the allocator, because the "Sorters" are updated
-  // only within recoverResources() (see MESOS-621). The calls to
-  // recoverResources() below are therefore required, even though
-  // the slave is already removed.
   allocator->removeSlave(slave->id);
 
   // Transition tasks to TASK_UNREACHABLE/TASK_GONE_BY_OPERATOR/TASK_LOST
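
Taken together, the hunks above reorder recoverResources() so that the
framework/agent existence checks happen once, up front, and the early
return replaces the per-step isSome() guards. A compilable skeleton of
the resulting control flow, with stand-in types and the role tree,
sorter, and filter steps elided (not an excerpt of the Mesos source):

#include <map>
#include <set>
#include <string>

using FrameworkID = std::string;
using SlaveID = std::string;
using Resources = int;  // stand-in for mesos::Resources

struct Allocator
{
  std::set<FrameworkID> frameworks;                // tracked frameworks
  std::map<SlaveID, Resources> offeredOrAllocated; // per-agent totals

  void recoverResources(
      const FrameworkID& frameworkId,
      const SlaveID& slaveId,
      const Resources& resources)
  {
    if (resources == 0) {
      return;  // nothing to recover
    }

    // The patch hoists this check to the top: if either party is
    // already gone, its resources were fully recovered on removal,
    // so every step below may assume both still exist.
    if (frameworks.count(frameworkId) == 0 ||
        offeredOrAllocated.count(slaveId) == 0) {
      return;
    }

    // 1. Update the agent's bookkeeping (increaseAvailable).
    offeredOrAllocated[slaveId] -= resources;

    // 2. Untrack in the role tree and sorters (elided).
    // 3. Install the refused-resources filter (elided).
  }
};

int main()
{
  Allocator allocator;

  // Unknown framework/agent: a no-op instead of stale bookkeeping.
  allocator.recoverResources("framework-1", "agent-1", 2);
  return 0;
}
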


[mesos] 02/04: Refactored framework role tracking logic in the allocator.

Posted by mz...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

mzhu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 81e10fc023cafda0570658dc7288c7b1f71df618
Author: Meng Zhu <mz...@mesosphere.io>
AuthorDate: Wed Sep 18 15:16:24 2019 -0700

    Refactored framework role tracking logic in the allocator.
    
    A framework is tracked under the role if the framework:
    (1) is subscribed to the role;
    Or
    (2) has resources allocated under the role.
    
    (2) could be true without (1). This is because we allow
    frameworks to update their roles without revoking their
    allocated or offered resources.
    
    This patch recognizes this fact and encapsulates the related
    checks in the helper.
    
    Benchmark result: no major performance impact.
    
    Master
    
    BENCHMARK_HierarchicalAllocator_WithQuotaParam.LargeAndSmallQuota/2
    Added 3000 agents in 90.816373ms
    Added 3000 frameworks in 17.202199348secs
    Made 3500 allocations in 15.46714944secs
    Made 0 allocation in 13.767830924secs
    
    Master + this patch:
    
    BENCHMARK_HierarchicalAllocator_WithQuotaParam.LargeAndSmallQuota/2
    Added 3000 agents in 86.782773ms
    Added 3000 frameworks in 18.174924815secs
    Made 3500 allocations in 15.813011454secs
    Made 0 allocation in 12.882917161secs
    
    Review: https://reviews.apache.org/r/71516
---
 src/master/allocator/mesos/hierarchical.cpp | 64 +++++++++++++++--------------
 src/master/allocator/mesos/hierarchical.hpp | 27 ++++++++----
 2 files changed, 54 insertions(+), 37 deletions(-)

diff --git a/src/master/allocator/mesos/hierarchical.cpp b/src/master/allocator/mesos/hierarchical.cpp
index 256fdce..62750cc 100644
--- a/src/master/allocator/mesos/hierarchical.cpp
+++ b/src/master/allocator/mesos/hierarchical.cpp
@@ -633,7 +633,7 @@ void HierarchicalAllocatorProcess::addFramework(
   const Framework& framework = *CHECK_NOTNONE(getFramework(frameworkId));
 
   foreach (const string& role, framework.roles) {
-    trackFrameworkUnderRole(frameworkId, role);
+    trackFrameworkUnderRole(framework, role);
 
     Sorter* frameworkSorter = CHECK_NOTNONE(getFrameworkSorter(role));
 
@@ -699,7 +699,10 @@ void HierarchicalAllocatorProcess::removeFramework(
       untrackAllocatedResources(slaveId, frameworkId, allocated);
     }
 
-    untrackFrameworkUnderRole(frameworkId, role);
+    CHECK(tryUntrackFrameworkUnderRole(framework, role))
+      << " Framework: " << frameworkId
+      << " role: " << role
+      << " allocation: " << allocation;
   }
 
   // Transfer ownership of this framework's metrics to
@@ -790,7 +793,8 @@ void HierarchicalAllocatorProcess::updateFramework(
     // under the role because a framework can unsubscribe from a role
     // while it still has resources allocated to the role.
     if (!isFrameworkTrackedUnderRole(frameworkId, role)) {
-      trackFrameworkUnderRole(frameworkId, role);
+      // TODO(mzhu): `CHECK` the above case.
+      trackFrameworkUnderRole(framework, role);
     }
 
     frameworkSorters.at(role)->activate(frameworkId.value());
@@ -801,11 +805,7 @@ void HierarchicalAllocatorProcess::updateFramework(
 
     frameworkSorter->deactivate(frameworkId.value());
 
-    // Stop tracking the framework under this role if there are
-    // no longer any resources allocated to it.
-    if (frameworkSorter->allocation(frameworkId.value()).empty()) {
-      untrackFrameworkUnderRole(frameworkId, role);
-    }
+    tryUntrackFrameworkUnderRole(framework, role);
 
     if (framework.offerFilters.contains(role)) {
       framework.offerFilters.erase(role);
@@ -1485,13 +1485,6 @@ void HierarchicalAllocatorProcess::recoverResources(
 
     if (frameworkSorter->contains(frameworkId.value())) {
       untrackAllocatedResources(slaveId, frameworkId, resources);
-
-      // Stop tracking the framework under this role if it's no longer
-      // subscribed and no longer has resources allocated to the role.
-      if ((*framework)->roles.count(role) == 0 &&
-          frameworkSorter->allocation(frameworkId.value()).empty()) {
-        untrackFrameworkUnderRole(frameworkId, role);
-      }
     }
   }
 
@@ -2768,7 +2761,7 @@ const Quota& HierarchicalAllocatorProcess::getQuota(const string& role) const
 
 
 void HierarchicalAllocatorProcess::trackFrameworkUnderRole(
-    const FrameworkID& frameworkId, const string& role)
+    const Framework& framework, const string& role)
 {
   CHECK(initialized);
 
@@ -2792,28 +2785,32 @@ void HierarchicalAllocatorProcess::trackFrameworkUnderRole(
     }
   }
 
-  roleTree.trackFramework(frameworkId, role);
+  roleTree.trackFramework(framework.frameworkId, role);
 
   Sorter* frameworkSorter = CHECK_NOTNONE(getFrameworkSorter(role));
 
-  CHECK_NOT_CONTAINS(*frameworkSorter, frameworkId.value())
+  CHECK_NOT_CONTAINS(*frameworkSorter, framework.frameworkId.value())
     << " for role " << role;
-  frameworkSorter->add(frameworkId.value());
+  frameworkSorter->add(framework.frameworkId.value());
 }
 
 
-void HierarchicalAllocatorProcess::untrackFrameworkUnderRole(
-    const FrameworkID& frameworkId, const string& role)
+bool HierarchicalAllocatorProcess::tryUntrackFrameworkUnderRole(
+    const Framework& framework, const string& role)
 {
   CHECK(initialized);
 
-  roleTree.untrackFramework(frameworkId, role);
-
   Sorter* frameworkSorter = CHECK_NOTNONE(getFrameworkSorter(role));
-
-  CHECK_CONTAINS(*frameworkSorter, frameworkId.value())
+  CHECK_CONTAINS(*frameworkSorter, framework.frameworkId.value())
     << " for role " << role;
-  frameworkSorter->remove(frameworkId.value());
+
+  if (!frameworkSorter->allocation(framework.frameworkId.value()).empty()) {
+    return false;
+  }
+
+  roleTree.untrackFramework(framework.frameworkId, role);
+
+  frameworkSorter->remove(framework.frameworkId.value());
 
   if (roleTree.get(role).isNone() ||
       (*roleTree.get(role))->frameworks().empty()) {
@@ -2821,6 +2818,8 @@ void HierarchicalAllocatorProcess::untrackFrameworkUnderRole(
     roleSorter->remove(role);
     frameworkSorters.erase(role);
   }
+
+  return true;
 }
 
 
@@ -2961,7 +2960,7 @@ void HierarchicalAllocatorProcess::trackAllocatedResources(
     // or may not be subscribed to the role. Either way, we need to
     // track the framework under the role.
     if (!isFrameworkTrackedUnderRole(frameworkId, role)) {
-      trackFrameworkUnderRole(frameworkId, role);
+      trackFrameworkUnderRole(*CHECK_NOTNONE(getFramework(frameworkId)), role);
     }
 
     CHECK_CONTAINS(*roleSorter, role);
@@ -2991,7 +2990,7 @@ void HierarchicalAllocatorProcess::untrackAllocatedResources(
   // But currently, a slave is removed first via `removeSlave()`
   // and later a call to `recoverResources()` occurs to recover
   // the framework's resources.
-  CHECK_CONTAINS(frameworks, frameworkId);
+  Framework* framework = CHECK_NOTNONE(getFramework(frameworkId));
 
   // TODO(bmahler): Calling allocations() is expensive since it has
   // to construct a map. Avoid this.
@@ -3007,10 +3006,15 @@ void HierarchicalAllocatorProcess::untrackAllocatedResources(
 
     roleTree.untrackOfferedOrAllocated(allocation);
 
-    frameworkSorter->unallocated(
-        frameworkId.value(), slaveId, allocation);
+    frameworkSorter->unallocated(frameworkId.value(), slaveId, allocation);
 
     roleSorter->unallocated(role, slaveId, allocation);
+
+    // If the framework is no longer subscribed to the role, we can try to
+    // untrack the framework under the role.
+    if (framework->roles.count(role) == 0) {
+      tryUntrackFrameworkUnderRole(*framework, role);
+    }
   }
 }
 
diff --git a/src/master/allocator/mesos/hierarchical.hpp b/src/master/allocator/mesos/hierarchical.hpp
index 5ea3791..36aa2fc 100644
--- a/src/master/allocator/mesos/hierarchical.hpp
+++ b/src/master/allocator/mesos/hierarchical.hpp
@@ -168,7 +168,16 @@ private:
   // By default, weights == DEFAULT_WEIGHT == 1.0.
   double weight_;
 
-  // IDs of the frameworks subscribed to the role, if any.
+  // IDs of the frameworks tracked under the role, if any.
+  // A framework is tracked under the role if the framework:
+  //
+  // (1) is subscribed to the role;
+  // *OR*
+  // (2) has resources allocated under the role.
+  //
+  // NOTE: (2) could be true without (1). This is because the allocator
+  // interface allows for a framework role to be removed without recovering
+  // resources offered or allocated to this role.
   hashset<FrameworkID> frameworks_;
 
   // Total allocated or offered scalar resources to this role, including
@@ -773,13 +782,17 @@ private:
 
   const Quota& getQuota(const std::string& role) const;
 
+  // Helpers to track and untrack a framework under a role.
+  // A framework should be tracked under a role if it subscribes to the
+  // role *OR* has resources allocated/offered to that role. When neither
+  // condition is met, it should be untracked.
+  //
+  // `tryUntrackFrameworkUnderRole` returns true if the framework is untracked
+  // under the role.
   void trackFrameworkUnderRole(
-      const FrameworkID& frameworkId,
-      const std::string& role);
-
-  void untrackFrameworkUnderRole(
-      const FrameworkID& frameworkId,
-      const std::string& role);
+      const Framework& framework, const std::string& role);
+  bool tryUntrackFrameworkUnderRole(
+      const Framework& framework, const std::string& role);
 
   void suppressRoles(Framework& framework, const std::set<std::string>& roles);
   void reviveRoles(Framework& framework, const std::set<std::string>& roles);
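
The tracking rule this commit encodes, reduced to a standalone sketch
with simplified stand-in types (not the Mesos source): a framework stays
tracked under a role while it is subscribed *OR* still holds an
allocation there, and untracking only succeeds once the allocation is
empty, which is what the boolean return value reports.

#include <cassert>
#include <map>
#include <set>
#include <string>

using FrameworkID = std::string;

struct RoleState
{
  std::set<FrameworkID> tracked;         // frameworks_ in the role tree
  std::map<FrameworkID, int> allocated;  // allocation size per framework

  // Mirrors tryUntrackFrameworkUnderRole(): callers invoke this once
  // the framework is unsubscribed from the role (or being removed);
  // untracking only happens when no allocation remains.
  bool tryUntrack(const FrameworkID& id)
  {
    auto it = allocated.find(id);
    if (it != allocated.end() && it->second > 0) {
      return false;  // condition (2) still holds: keep tracking
    }

    allocated.erase(id);
    tracked.erase(id);
    return true;
  }
};

int main()
{
  RoleState role;
  role.tracked.insert("fw");
  role.allocated["fw"] = 2;  // two cpus allocated under the role

  // The framework unsubscribes (updateFramework), but still holds
  // resources under the role, so it cannot be untracked yet.
  assert(!role.tryUntrack("fw"));

  // Once the last allocation is recovered, untracking succeeds.
  role.allocated["fw"] = 0;
  assert(role.tryUntrack("fw"));
  assert(role.tracked.count("fw") == 0);
  return 0;
}
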


[mesos] 01/04: Fixed a bug where sorter may leak clients.

Posted by mz...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

mzhu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 82f2dfd27449202afaf3feb08bb6d4ddeba7f515
Author: Meng Zhu <mz...@mesosphere.io>
AuthorDate: Wed Sep 18 16:45:46 2019 -0700

    Fixed a bug where sorter may leak clients.
    
    It is possible for an allocation to be updated to empty in the
    sorter; see MESOS-9015. When this happens, the sorter leaks
    client entries, since it does not erase the now-empty entries.

    This patch fixes the issue and adds a regression test.
    
    Review: https://reviews.apache.org/r/71515
---
 src/master/allocator/mesos/sorter/drf/sorter.hpp    |  8 +++++++-
 src/master/allocator/mesos/sorter/random/sorter.hpp |  8 +++++++-
 src/tests/sorter_tests.cpp                          | 16 ++++++++++++++++
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/src/master/allocator/mesos/sorter/drf/sorter.hpp b/src/master/allocator/mesos/sorter/drf/sorter.hpp
index 2a861e2..ab20a86 100644
--- a/src/master/allocator/mesos/sorter/drf/sorter.hpp
+++ b/src/master/allocator/mesos/sorter/drf/sorter.hpp
@@ -356,7 +356,7 @@ struct DRFSorter::Node
 
       totals -= quantitiesToRemove;
 
-      if (resources[slaveId].empty()) {
+      if (resources.at(slaveId).empty()) {
         resources.erase(slaveId);
       }
     }
@@ -382,6 +382,12 @@ struct DRFSorter::Node
       resources[slaveId] -= oldAllocation;
       resources[slaveId] += newAllocation;
 
+      // It is possible for an allocation to be updated to empty.
+      // See MESOS-9015 and MESOS-9975.
+      if (resources.at(slaveId).empty()) {
+        resources.erase(slaveId);
+      }
+
       totals -= oldAllocationQuantities;
       totals += newAllocationQuantities;
     }
diff --git a/src/master/allocator/mesos/sorter/random/sorter.hpp b/src/master/allocator/mesos/sorter/random/sorter.hpp
index bc55809..06362ce 100644
--- a/src/master/allocator/mesos/sorter/random/sorter.hpp
+++ b/src/master/allocator/mesos/sorter/random/sorter.hpp
@@ -376,7 +376,7 @@ struct RandomSorter::Node
 
       totals -= quantitiesToRemove;
 
-      if (resources[slaveId].empty()) {
+      if (resources.at(slaveId).empty()) {
         resources.erase(slaveId);
       }
     }
@@ -402,6 +402,12 @@ struct RandomSorter::Node
       resources[slaveId] -= oldAllocation;
       resources[slaveId] += newAllocation;
 
+      // It is possible for an allocation to be updated to empty.
+      // See MESOS-9015 and MESOS-9975.
+      if (resources.at(slaveId).empty()) {
+        resources.erase(slaveId);
+      }
+
       totals -= oldAllocationQuantities;
       totals += newAllocationQuantities;
     }
diff --git a/src/tests/sorter_tests.cpp b/src/tests/sorter_tests.cpp
index 64627c4..d7fdee8 100644
--- a/src/tests/sorter_tests.cpp
+++ b/src/tests/sorter_tests.cpp
@@ -1222,6 +1222,22 @@ TYPED_TEST(CommonSorterTest, UpdateAllocation)
   EXPECT_EQ(
       CHECK_NOTERROR(ResourceQuantities::fromString("cpus:10;mem:10;disk:10")),
       sorter.allocationScalarQuantities());
+
+  // Test update allocation to empty.
+  Resources resourcesC =
+    CHECK_NOTERROR(Resources::parse("cpus:10;mem:10;disk:10"));
+
+  SlaveID slaveId2;
+  slaveId2.set_value("agentId2");
+
+  sorter.add("c");
+  sorter.activate("c");
+
+  sorter.add(slaveId2, resourcesC);
+
+  sorter.allocated("c", slaveId2, resourcesC);
+  sorter.update("c", slaveId2, resourcesC, Resources());
+  EXPECT_TRUE(sorter.allocation("c").empty());
 }
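
The leak fixed in 01/04 boils down to a stale empty map entry. A
standalone illustration using a plain std::map in place of the sorter's
per-node resources map (stand-in types, not the sorter itself): updating
an allocation down to empty via "-= old; += new" leaves an empty value
behind unless the key is explicitly erased, which is exactly the step
the patch adds.

#include <cassert>
#include <map>
#include <set>
#include <string>

using SlaveID = std::string;
using Resources = std::multiset<std::string>;  // stand-in resource bag

int main()
{
  std::map<SlaveID, Resources> resources;

  const Resources oldAllocation = {"cpus:10", "mem:10", "disk:10"};
  const Resources newAllocation = {};  // allocation updated to empty

  resources["agent"] = oldAllocation;

  // Pre-patch update path: subtract the old allocation, add the new
  // one. The value is now empty, but the key lingers.
  for (const std::string& r : oldAllocation) {
    resources["agent"].erase(resources["agent"].find(r));
  }
  for (const std::string& r : newAllocation) {
    resources["agent"].insert(r);
  }

  assert(resources.count("agent") == 1);  // leaked empty entry
  assert(resources.at("agent").empty());

  // The patch adds exactly this erase-when-empty step:
  if (resources.at("agent").empty()) {
    resources.erase("agent");
  }

  assert(resources.count("agent") == 0);  // entry reclaimed
  return 0;
}
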