You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by mz...@apache.org on 2019/09/23 20:43:10 UTC
[mesos] 03/04: Simplified recover resources when removing
frameworks or agents.
This is an automated email from the ASF dual-hosted git repository.
mzhu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/mesos.git
commit 5849c4af5fc702d9bb366da44f58ae93a2712f3a
Author: Meng Zhu <mz...@mesosphere.io>
AuthorDate: Thu Sep 12 11:46:23 2019 -0700
Simplified recover resources when removing frameworks or agents.
Review: https://reviews.apache.org/r/71476
---
src/master/allocator/mesos/hierarchical.cpp | 85 ++++++++++++++++-------------
src/master/allocator/mesos/hierarchical.hpp | 8 +++
src/master/master.cpp | 12 ----
3 files changed, 55 insertions(+), 50 deletions(-)
diff --git a/src/master/allocator/mesos/hierarchical.cpp b/src/master/allocator/mesos/hierarchical.cpp
index 62750cc..4728154 100644
--- a/src/master/allocator/mesos/hierarchical.cpp
+++ b/src/master/allocator/mesos/hierarchical.cpp
@@ -675,6 +675,15 @@ void HierarchicalAllocatorProcess::removeFramework(
{
CHECK(initialized);
+ // Free up resources on agents if any.
+ foreachvalue (Slave& slave, slaves) {
+ slave.increaseAvailable(
+ frameworkId,
+ slave.getOfferedOrAllocated().get(frameworkId).getOrElse(Resources()));
+ }
+
+ // Update tracking in the role tree and sorters.
+
Framework& framework = *CHECK_NOTNONE(getFramework(frameworkId));
foreach (const string& role, framework.roles) {
@@ -926,11 +935,13 @@ void HierarchicalAllocatorProcess::removeSlave(
{
const Slave& slave = *CHECK_NOTNONE(getSlave(slaveId));
- // TODO(bmahler): Per MESOS-621, this should remove the allocations
- // that any frameworks have on this slave. Otherwise the caller may
- // "leak" allocated resources accidentally if they forget to recover
- // all the resources. Fixing this would require more information
- // than what we currently track in the allocator.
+ // Untrack resources in roleTree and sorter.
+ foreachpair (
+ const FrameworkID& frameworkId,
+ const Resources& resources,
+ slave.getOfferedOrAllocated()) {
+ untrackAllocatedResources(slaveId, frameworkId, resources);
+ }
roleSorter->remove(slaveId, slave.getTotal());
@@ -1459,6 +1470,22 @@ void HierarchicalAllocatorProcess::recoverResources(
return;
}
+ Option<Framework*> framework = getFramework(frameworkId);
+ Option<Slave*> slave = getSlave(slaveId);
+
+ // No work to do if either the framework or the agent no longer exists.
+ //
+ // The framework may not exist if we dispatched Master::offer before we
+ // received MesosAllocatorProcess::removeFramework or
+ // MesosAllocatorProcess::deactivateFramework, in which case we will
+ // have already recovered all of its resources).
+ //
+ // The agent may not exist if we dispatched Master::offer before we
+ // received `removeSlave`.
+ if (framework.isNone() || slave.isNone()) {
+ return;
+ }
+
// For now, we require that resources are recovered within a single
// allocation role (since filtering in the same manner across roles
// seems undesirable).
@@ -1472,41 +1499,26 @@ void HierarchicalAllocatorProcess::recoverResources(
string role = allocations.begin()->first;
- // Updated resources allocated to framework (if framework still
- // exists, which it might not in the event that we dispatched
- // Master::offer before we received
- // MesosAllocatorProcess::removeFramework or
- // MesosAllocatorProcess::deactivateFramework, in which case we will
- // have already recovered all of its resources).
- Option<Framework*> framework = getFramework(frameworkId);
+ // Update resources on the agent.
- if (framework.isSome()) {
- Sorter* frameworkSorter = CHECK_NOTNONE(getFrameworkSorter(role));
+ CHECK((*slave)->getTotalOfferedOrAllocated().contains(resources))
+ << "agent " << slaveId << " resources "
+ << (*slave)->getTotalOfferedOrAllocated() << " do not contain "
+ << resources;
- if (frameworkSorter->contains(frameworkId.value())) {
- untrackAllocatedResources(slaveId, frameworkId, resources);
- }
- }
+ (*slave)->increaseAvailable(frameworkId, resources);
- // Update resources allocated on slave (if slave still exists,
- // which it might not in the event that we dispatched Master::offer
- // before we received Allocator::removeSlave).
- Option<Slave*> slave = getSlave(slaveId);
+ VLOG(1) << "Recovered " << resources << " (total: " << (*slave)->getTotal()
+ << ", offered or allocated: "
+ << (*slave)->getTotalOfferedOrAllocated() << ")"
+ << " on agent " << slaveId << " from framework " << frameworkId;
- if (slave.isSome()) {
- CHECK((*slave)->getTotalOfferedOrAllocated().contains(resources))
- << "agent " << slaveId << " resources "
- << (*slave)->getTotalOfferedOrAllocated() << " do not contain "
- << resources;
+ // Update role tree and sorter.
- (*slave)->increaseAvailable(frameworkId, resources);
+ Sorter* frameworkSorter = CHECK_NOTNONE(getFrameworkSorter(role));
- VLOG(1) << "Recovered " << resources
- << " (total: " << (*slave)->getTotal()
- << ", offered or allocated: "
- << (*slave)->getTotalOfferedOrAllocated() << ")"
- << " on agent " << slaveId
- << " from framework " << frameworkId;
+ if (frameworkSorter->contains(frameworkId.value())) {
+ untrackAllocatedResources(slaveId, frameworkId, resources);
}
// No need to install the filter if 'filters' is none.
@@ -1514,10 +1526,7 @@ void HierarchicalAllocatorProcess::recoverResources(
return;
}
- // No need to install the filter if slave/framework does not exist.
- if (framework.isNone() || slave.isNone()) {
- return;
- }
+ // Update filters.
// Create a refused resources filter.
Try<Duration> timeout = Duration::create(Filters().refuse_seconds());
diff --git a/src/master/allocator/mesos/hierarchical.hpp b/src/master/allocator/mesos/hierarchical.hpp
index 36aa2fc..d42124f 100644
--- a/src/master/allocator/mesos/hierarchical.hpp
+++ b/src/master/allocator/mesos/hierarchical.hpp
@@ -321,8 +321,12 @@ public:
const FrameworkID& frameworkId, const Resources& offeredOrAllocated_)
{
// Increasing available is to subtract offered or allocated.
+ if (offeredOrAllocated_.empty()) {
+ return;
+ }
Resources& resources = offeredOrAllocated.at(frameworkId);
+ CHECK_CONTAINS(resources, offeredOrAllocated_);
resources -= offeredOrAllocated_;
if (resources.empty()) {
offeredOrAllocated.erase(frameworkId);
@@ -336,6 +340,10 @@ public:
void decreaseAvailable(
const FrameworkID& frameworkId, const Resources& offeredOrAllocated_)
{
+ if (offeredOrAllocated_.empty()) {
+ return;
+ }
+
// Decreasing available is to add offered or allocated.
offeredOrAllocated[frameworkId] += offeredOrAllocated_;
diff --git a/src/master/master.cpp b/src/master/master.cpp
index e5bc493..65994aa 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -11640,12 +11640,6 @@ void Master::_removeSlave(
// We want to remove the slave first, to avoid the allocator
// re-allocating the recovered resources.
- //
- // NOTE: Removing the slave is not sufficient for recovering the
- // resources in the allocator, because the "Sorters" are updated
- // only within recoverResources() (see MESOS-621). The calls to
- // recoverResources() below are therefore required, even though
- // the slave is already removed.
allocator->removeSlave(slave->id);
// Transition the tasks to lost and remove them.
@@ -11774,12 +11768,6 @@ void Master::__removeSlave(
{
// We want to remove the slave first, to avoid the allocator
// re-allocating the recovered resources.
- //
- // NOTE: Removing the slave is not sufficient for recovering the
- // resources in the allocator, because the "Sorters" are updated
- // only within recoverResources() (see MESOS-621). The calls to
- // recoverResources() below are therefore required, even though
- // the slave is already removed.
allocator->removeSlave(slave->id);
// Transition tasks to TASK_UNREACHABLE/TASK_GONE_BY_OPERATOR/TASK_LOST