You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by gr...@apache.org on 2017/12/13 00:59:20 UTC
[3/7] mesos git commit: Made master reconcile known offer operations
with agent.
Made master reconcile known offer operations with agent.
In cases where the agent fails over or where an `UpdateSlaveMessage`
races with an `ApplyOfferOperationMessage`, it's possible that the
master knows about an offer operation which is not contained in an
`UpdateSlaveMessage`. In such cases, the master should send a
`ReconcileOfferOperationsMessage` to the agent. The agent will
then respond by sending OFFER_OPERATION_DROPPED status updates for
any operations which it does not know about.
Review: https://reviews.apache.org/r/64464/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/5c91546b
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/5c91546b
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/5c91546b
Branch: refs/heads/master
Commit: 5c91546babf59a42c4e3fc98d5c712e8d1ddd3d3
Parents: 9d7da9b
Author: Greg Mann <gr...@mesosphere.io>
Authored: Tue Dec 12 16:18:41 2017 -0800
Committer: Greg Mann <gr...@gmail.com>
Committed: Tue Dec 12 16:55:47 2017 -0800
----------------------------------------------------------------------
src/master/master.cpp | 50 ++++++++++++++++++++++++++++------------------
1 file changed, 31 insertions(+), 19 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/5c91546b/src/master/master.cpp
----------------------------------------------------------------------
diff --git a/src/master/master.cpp b/src/master/master.cpp
index efe8b8f..806fbc2 100644
--- a/src/master/master.cpp
+++ b/src/master/master.cpp
@@ -7575,6 +7575,8 @@ void Master::updateSlave(const UpdateSlaveMessage& message)
}
}
+ ReconcileOfferOperationsMessage reconcile;
+
// Update master and allocator state.
foreachpair (
const Option<ResourceProviderID>& providerId,
@@ -7637,29 +7639,35 @@ void Master::updateSlave(const UpdateSlaveMessage& message)
} else {
// If this is a known resource provider or agent its total capacity cannot
// have changed, and it would not know about any non-terminal offer
- // operations not already known to the master. It might however have not
- // received an offer operations since the resource provider or agent fell
- // over before the message could be received. We need to remove these
- // operations from our state.
-
- // Reconcile offer operations. This includes recovering
- // resources in used by operations which did not reach the
- // agent or resource provider.
+ // operations not already known to the master. However, it might not have
+ // received an offer operation for a couple different reasons:
+ // - The resource provider or agent could have failed over before the
+ // operation's `ApplyOfferOperationMessage` could be received.
+ // - The operation's `ApplyOfferOperationMessage` could have raced with
+ // this `UpdateSlaveMessage`.
+ //
+ // In both of these cases, we need to reconcile such operations explicitly
+ // with the agent. For operations which the agent or resource provider
+ // does not recognize, an OFFER_OPERATION_DROPPED status update will be
+ // generated and the master will remove the operation from its state upon
+ // receipt of that update.
if (provider.oldOfferOperations.isSome()) {
foreachkey (const UUID& uuid, provider.oldOfferOperations.get()) {
if (provider.newOfferOperations.isNone() ||
!provider.newOfferOperations->contains(uuid)) {
- // TODO(bbannier): Instead of simply dropping an operation with
- // `removeOfferOperation` here we should instead send a `Reconcile`
- // message with a failed state to the agent so its status update
- // manager can reliably deliver the operation status to the
- // framework.
- LOG(WARNING) << "Dropping known offer operation " << uuid.toString()
- << " since it was not present in reconciliation "
- "message from agent";
-
- CHECK(slave->offerOperations.contains(uuid));
- removeOfferOperation(slave->offerOperations.at(uuid));
+ LOG(WARNING) << "Performing explicit reconciliation with agent for"
+ << " known offer operation " << uuid.toString()
+ << " since it was not present in original"
+ << " reconciliation message from agent";
+
+ ReconcileOfferOperationsMessage::Operation* reconcileOperation =
+ reconcile.add_operations();
+ reconcileOperation->set_operation_uuid(uuid.toBytes());
+
+ if (providerId.isSome()) {
+ reconcileOperation->mutable_resource_provider_id()
+ ->CopyFrom(providerId.get());
+ }
}
}
}
@@ -7679,6 +7687,10 @@ void Master::updateSlave(const UpdateSlaveMessage& message)
}
}
+ if (reconcile.operations_size() > 0) {
+ send(slave->pid, reconcile);
+ }
+
// Now update the agent's state and total resources in the allocator.
allocator->updateSlave(slaveId, slave->info, slave->totalResources);