Posted to reviews@yunikorn.apache.org by GitBox <gi...@apache.org> on 2021/01/12 18:28:59 UTC

[GitHub] [incubator-yunikorn-core] kingamarton commented on a change in pull request #236: [YUNIKORN-476] placeholder allocation and swap

kingamarton commented on a change in pull request #236:
URL: https://github.com/apache/incubator-yunikorn-core/pull/236#discussion_r555946170



##########
File path: pkg/scheduler/objects/application.go
##########
@@ -604,6 +630,121 @@ func (sa *Application) tryAllocate(headRoom *resources.Resource, nodeIterator fu
 	return nil
 }
 
+// Try to replace a placeholder with a real allocation
+func (sa *Application) tryPlaceholderAllocate(nodeIterator func() interfaces.NodeIterator) *Allocation {
+	// cannot allocate if the iterator is not giving us any nodes
+	iterator := nodeIterator()
+	if iterator == nil {
+		return nil
+	}
+	sa.Lock()
+	defer sa.Unlock()
+	// nothing to do if we have no placeholders allocated
+	if resources.IsZero(sa.allocatedPlaceholder) {
+		return nil
+	}
+	// make sure the request are sorted
+	sa.sortRequests(false)
+	// keep the first fits for later
+	var phFit *Allocation
+	var reqFit *AllocationAsk
+	// get all the requests from the app sorted in order
+	for _, request := range sa.sortedRequests {
+		// skip placeholders they follow standard allocation
+		// this should also be part of a task group just make sure it is
+		if request.placeholder || request.taskGroupName == "" {
+			continue
+		}
+		// walk over the placeholders, allow for processing all as we can have multiple task groups
+		phAllocs := sa.getPlaceholderAllocations()
+		for _, ph := range phAllocs {
+			// we could have already released this placeholder and are waiting for the shim to confirm
+			// and check that we have the correct task group before trying to swap
+			if ph.released || request.taskGroupName != ph.taskGroupName {
+				continue
+			}
+			if phFit == nil && reqFit == nil {
+				phFit = ph
+				reqFit = request
+			}
+			// this is inefficient but needs to do for now as we only have the ID not the handle on the node itself
+			var ok bool
+			var node *Node
+			for iterator.HasNext() {
+				node, ok = iterator.Next().(*Node)
+				if !ok {
+					log.Logger().Warn("Node iterator failed to return a node")
+					return nil
+				}
+				if node.NodeID == ph.NodeID {
+					break
+				}
+			}
+			// got the node run same checks as for reservation (all but fits)
+			// resource usage should not change anyway between placeholder and real one
+			if node != nil && node.preReserveConditions(request.AllocationKey) {

Review comment:
       I know that we discussed offline that the shim side should check whether the resources requested for the real pod are the same as for the placeholder, but I think it would be worth adding a check here as well.
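
       A minimal sketch of what such a core-side guard could look like inside the placeholder loop. The resource field names (ph.AllocatedResource, request.AllocatedResource) and resources.Equals are assumptions about the surrounding code, not part of this PR:

           // Hypothetical guard: only swap when the real ask requests exactly the
           // same resources as the placeholder it is meant to replace.
           if !resources.Equals(ph.AllocatedResource, request.AllocatedResource) {
               log.Logger().Warn("placeholder and real allocation resources differ, skipping swap",
                   zap.String("allocationKey", request.AllocationKey),
                   zap.String("placeholder", ph.UUID))
               continue
           }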

##########
File path: pkg/scheduler/objects/application.go
##########
@@ -604,6 +630,121 @@ func (sa *Application) tryAllocate(headRoom *resources.Resource, nodeIterator fu
 	return nil
 }
 
+// Try to replace a placeholder with a real allocation
+func (sa *Application) tryPlaceholderAllocate(nodeIterator func() interfaces.NodeIterator) *Allocation {
+	// cannot allocate if the iterator is not giving us any nodes
+	iterator := nodeIterator()
+	if iterator == nil {
+		return nil
+	}
+	sa.Lock()
+	defer sa.Unlock()
+	// nothing to do if we have no placeholders allocated
+	if resources.IsZero(sa.allocatedPlaceholder) {
+		return nil
+	}
+	// make sure the request are sorted
+	sa.sortRequests(false)
+	// keep the first fits for later
+	var phFit *Allocation
+	var reqFit *AllocationAsk
+	// get all the requests from the app sorted in order
+	for _, request := range sa.sortedRequests {
+		// skip placeholders they follow standard allocation
+		// this should also be part of a task group just make sure it is
+		if request.placeholder || request.taskGroupName == "" {
+			continue
+		}
+		// walk over the placeholders, allow for processing all as we can have multiple task groups
+		phAllocs := sa.getPlaceholderAllocations()
+		for _, ph := range phAllocs {
+			// we could have already released this placeholder and are waiting for the shim to confirm
+			// and check that we have the correct task group before trying to swap
+			if ph.released || request.taskGroupName != ph.taskGroupName {
+				continue
+			}
+			if phFit == nil && reqFit == nil {
+				phFit = ph
+				reqFit = request
+			}
+			// this is inefficient but needs to do for now as we only have the ID not the handle on the node itself

Review comment:
       Is it OK that we check only the first fit (phFit == nil && reqFit == nil)? If yes, it makes no sense to iterate over the other placeholder allocations, because the first fit is checked again in the next steps of each iteration anyway. If not, then I think we should update phFit and reqFit in each iteration with the new fits.
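
       Whichever option is intended, a small helper could make the chosen policy explicit. The helper below is hypothetical (its name, signature and the firstOnly flag are not part of the PR), purely to illustrate the two choices:

           // pickFit is a hypothetical helper: with firstOnly=true it keeps the pair
           // that was recorded first and ignores later matches, otherwise it always
           // returns the latest matching placeholder/ask pair.
           func pickFit(curPh, newPh *Allocation, curReq, newReq *AllocationAsk, firstOnly bool) (*Allocation, *AllocationAsk) {
               if firstOnly && curPh != nil && curReq != nil {
                   return curPh, curReq
               }
               return newPh, newReq
           }

       The loop body would then read phFit, reqFit = pickFit(phFit, ph, reqFit, request, firstOnly), which makes the chosen policy obvious at the call site.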

##########
File path: pkg/scheduler/partition.go
##########
@@ -704,6 +720,57 @@ func (pc *PartitionContext) tryReservedAllocate() *objects.Allocation {
 	return nil
 }
 
+// Try process placeholder for the partition
+// Lock free call this all locks are taken when needed in called functions
+func (pc *PartitionContext) tryPlaceholderAllocate() *objects.Allocation {
+	if !resources.StrictlyGreaterThanZero(pc.root.GetPendingResource()) {
+		// nothing to do just return
+		return nil
+	}
+	// try allocating from the root down
+	alloc := pc.root.TryPlaceholderAllocate(pc.GetAllNodeIterator)
+	if alloc != nil {
+		return pc.replace(alloc)
+	}
+	return nil
+}
+
+// Process the allocation and make the left over changes in the partition.
+func (pc *PartitionContext) replace(alloc *objects.Allocation) *objects.Allocation {
+	pc.Lock()
+	defer pc.Unlock()
+	// partition is locked nothing can change from now on

Review comment:
       I think this comment line is redundant: from lines 740 and 741 we already know that the partition is locked and that it cannot change.

##########
File path: pkg/scheduler/partition.go
##########
@@ -704,6 +720,57 @@ func (pc *PartitionContext) tryReservedAllocate() *objects.Allocation {
 	return nil
 }
 
+// Try process placeholder for the partition
+// Lock free call this all locks are taken when needed in called functions
+func (pc *PartitionContext) tryPlaceholderAllocate() *objects.Allocation {
+	if !resources.StrictlyGreaterThanZero(pc.root.GetPendingResource()) {
+		// nothing to do just return
+		return nil
+	}
+	// try allocating from the root down
+	alloc := pc.root.TryPlaceholderAllocate(pc.GetAllNodeIterator)
+	if alloc != nil {
+		return pc.replace(alloc)
+	}
+	return nil
+}
+
+// Process the allocation and make the left over changes in the partition.
+func (pc *PartitionContext) replace(alloc *objects.Allocation) *objects.Allocation {
+	pc.Lock()
+	defer pc.Unlock()
+	// partition is locked nothing can change from now on
+	// find the app make sure it still exists
+	appID := alloc.ApplicationID
+	app := pc.applications[appID]
+	if app == nil {
+		log.Logger().Info("Application was removed while allocating",
+			zap.String("appID", appID))
+		return nil
+	}
+	// Safeguard against the unlikely case that we have clashes.
+	// A clash points to entropy issues on the node.
+	if _, found := pc.allocations[alloc.UUID]; found {
+		for {
+			allocationUUID := common.GetNewUUID()
+			log.Logger().Warn("UUID clash, random generator might be lacking entropy",
+				zap.String("uuid", alloc.UUID),
+				zap.String("new UUID", allocationUUID))
+			if pc.allocations[allocationUUID] == nil {
+				alloc.UUID = allocationUUID
+				break
+			}
+		}
+	}

Review comment:
       This is exactly the same as what we have in lines 822-835. Please extract it into a new function and reuse it.
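
       A possible shape for such a helper, as a sketch only (the method name getFreeUUID is made up and the partition lock is assumed to be held by the caller):

           // getFreeUUID is a hypothetical helper: it returns the given UUID if it is
           // not in use yet, otherwise it keeps generating new UUIDs until an unused
           // one is found.
           func (pc *PartitionContext) getFreeUUID(current string) string {
               if _, found := pc.allocations[current]; !found {
                   return current
               }
               for {
                   newUUID := common.GetNewUUID()
                   log.Logger().Warn("UUID clash, random generator might be lacking entropy",
                       zap.String("uuid", current),
                       zap.String("new UUID", newUUID))
                   if pc.allocations[newUUID] == nil {
                       return newUUID
                   }
               }
           }

       Both replace and the existing allocation path could then call alloc.UUID = pc.getFreeUUID(alloc.UUID).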




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org