Posted to reviews@yunikorn.apache.org by GitBox <gi...@apache.org> on 2021/01/12 04:11:20 UTC

[GitHub] [incubator-yunikorn-core] wilfred-s commented on a change in pull request #236: [YUNIKORN-476] placeholder allocation and swap

wilfred-s commented on a change in pull request #236:
URL: https://github.com/apache/incubator-yunikorn-core/pull/236#discussion_r555502337



##########
File path: pkg/scheduler/objects/application.go
##########
@@ -604,6 +630,121 @@ func (sa *Application) tryAllocate(headRoom *resources.Resource, nodeIterator fu
 	return nil
 }
 
+// Try to replace a placeholder with a real allocation
+func (sa *Application) tryPlaceholderAllocate(nodeIterator func() interfaces.NodeIterator) *Allocation {
+	// cannot allocate if the iterator is not giving us any nodes
+	iterator := nodeIterator()
+	if iterator == nil {
+		return nil
+	}
+	sa.Lock()
+	defer sa.Unlock()
+	// nothing to do if we have no placeholders allocated
+	if resources.IsZero(sa.allocatedPlaceholder) {
+		return nil
+	}
+	// make sure the request are sorted
+	sa.sortRequests(false)
+	// keep the first fits for later
+	var phFit *Allocation
+	var reqFit *AllocationAsk
+	// get all the requests from the app sorted in order
+	for _, request := range sa.sortedRequests {
+		// skip placeholders they follow standard allocation
+		// this should also be part of a task group just make sure it is
+		if request.placeholder || request.taskGroupName == "" {
+			continue
+		}
+		// walk over the placeholders, allow for processing all as we can have multiple task groups
+		phAllocs := sa.getPlaceholderAllocations()
+		for _, ph := range phAllocs {
+			// we could have already released this placeholder and are waiting for the shim to confirm
+			// and check that we have the correct task group before trying to swap
+			if ph.released || request.taskGroupName != ph.taskGroupName {
+				continue
+			}
+			if phFit == nil && reqFit == nil {
+				phFit = ph
+				reqFit = request
+			}
+			// this is inefficient but needs to do for now as we only have the ID not the handle on the node itself
+			var ok bool
+			var node *Node
+			for iterator.HasNext() {
+				node, ok = iterator.Next().(*Node)
+				if !ok {
+					log.Logger().Warn("Node iterator failed to return a node")
+					return nil
+				}
+				if node.NodeID == ph.NodeID {
+					break
+				}
+			}
+			// got the node run same checks as for reservation (all but fits)
+			// resource usage should not change anyway between placeholder and real one
+			if node != nil && node.preReserveConditions(request.AllocationKey) {
+				alloc := NewAllocation(common.GetNewUUID(), node.NodeID, request)
+				// double link to make it easier to find
+				alloc.Releases = []*Allocation{ph}
+				ph.Releases = []*Allocation{alloc}
+				alloc.Result = Replaced
+				// mark placeholder as released
+				ph.released = true
+				_, err := sa.updateAskRepeatInternal(request, -1)
+				if err != nil {
+					log.Logger().Warn("ask repeat update failed unexpectedly",
+						zap.Error(err))
+				}
+				return alloc
+			}
+			iterator.Reset()
+		}
+	}
+	// we checked all placeholders and asks nothing worked as yet
+	// pick the first fit and try all nodes if that fails give up
+	if phFit != nil && reqFit != nil {
+		for iterator.HasNext() {
+			node, ok := iterator.Next().(*Node)
+			if !ok {
+				log.Logger().Warn("Node iterator failed to return a node")
+				return nil
+			}
+			if err := node.preAllocateCheck(reqFit.AllocatedResource, reservationKey(nil, sa, reqFit), false); err != nil {
+				continue
+			}
+			// skip the node if conditions can not be satisfied
+			if !node.preAllocateConditions(reqFit.AllocationKey) {
+				continue
+			}
+			// allocation worked: on a non placeholder node update result and return
+			alloc := NewAllocation(common.GetNewUUID(), node.NodeID, reqFit)
+			// double link to make it easier to find
+			alloc.Releases = []*Allocation{phFit}
+			phFit.Releases = []*Allocation{alloc}
+			alloc.Result = Replaced
+			// mark placeholder as released
+			phFit.released = true
+			// update just the node to make sure we keep its spot
+			// no queue update as we're releasing the placeholder and are just temp over the size
+			if !node.AddAllocation(alloc) {
+				log.Logger().Debug("Node update failed unexpectedly",
+					zap.String("applicationID", sa.ApplicationID),
+					zap.String("ask", reqFit.String()),
+					zap.String("placeholder", phFit.String()))
+				return nil

Review comment:
       That would break the gang. When the placeholder is allocated, the queue usage is reserved for the gang. If we release the placeholder there is no guarantee that another application in the queue will not take that space; it could be an older application or even a newer one.
   It would in (almost) all cases directly break gang scheduling.
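   A minimal sketch of the ordering this argues for, assuming the package context of the diff above (pkg/scheduler/objects) so types such as Allocation and Node resolve; it is an illustration, not the PR code:

```go
// Build the replacement before anything is released so the queue usage the
// gang reserved never drops; only the node is updated here, the queue is
// corrected once the shim confirms the placeholder release, never before.
func swapPlaceholder(node *Node, ph *Allocation, request *AllocationAsk) *Allocation {
	alloc := NewAllocation(common.GetNewUUID(), node.NodeID, request)
	// double link placeholder and replacement so either side can find the other
	alloc.Releases = []*Allocation{ph}
	ph.Releases = []*Allocation{alloc}
	alloc.Result = Replaced
	ph.released = true
	// node-only update; no queue update, the queue is temporarily over its
	// size until the shim confirms the placeholder release
	if !node.AddAllocation(alloc) {
		return nil
	}
	return alloc
}
```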

##########
File path: pkg/scheduler/objects/application.go
##########
@@ -604,6 +630,121 @@ func (sa *Application) tryAllocate(headRoom *resources.Resource, nodeIterator fu
 	return nil
 }
 
+// Try to replace a placeholder with a real allocation
+func (sa *Application) tryPlaceholderAllocate(nodeIterator func() interfaces.NodeIterator) *Allocation {
+	// cannot allocate if the iterator is not giving us any nodes
+	iterator := nodeIterator()
+	if iterator == nil {
+		return nil
+	}
+	sa.Lock()
+	defer sa.Unlock()
+	// nothing to do if we have no placeholders allocated
+	if resources.IsZero(sa.allocatedPlaceholder) {
+		return nil
+	}
+	// make sure the request are sorted
+	sa.sortRequests(false)
+	// keep the first fits for later
+	var phFit *Allocation
+	var reqFit *AllocationAsk
+	// get all the requests from the app sorted in order
+	for _, request := range sa.sortedRequests {
+		// skip placeholders they follow standard allocation
+		// this should also be part of a task group just make sure it is
+		if request.placeholder || request.taskGroupName == "" {
+			continue
+		}
+		// walk over the placeholders, allow for processing all as we can have multiple task groups
+		phAllocs := sa.getPlaceholderAllocations()
+		for _, ph := range phAllocs {
+			// we could have already released this placeholder and are waiting for the shim to confirm
+			// and check that we have the correct task group before trying to swap
+			if ph.released || request.taskGroupName != ph.taskGroupName {
+				continue
+			}
+			if phFit == nil && reqFit == nil {
+				phFit = ph
+				reqFit = request
+			}
+			// this is inefficient but needs to do for now as we only have the ID not the handle on the node itself
+			var ok bool
+			var node *Node
+			for iterator.HasNext() {
+				node, ok = iterator.Next().(*Node)
+				if !ok {
+					log.Logger().Warn("Node iterator failed to return a node")
+					return nil
+				}
+				if node.NodeID == ph.NodeID {
+					break
+				}
+			}
+			// got the node run same checks as for reservation (all but fits)
+			// resource usage should not change anyway between placeholder and real one
+			if node != nil && node.preReserveConditions(request.AllocationKey) {
+				alloc := NewAllocation(common.GetNewUUID(), node.NodeID, request)
+				// double link to make it easier to find
+				alloc.Releases = []*Allocation{ph}
+				ph.Releases = []*Allocation{alloc}
+				alloc.Result = Replaced
+				// mark placeholder as released
+				ph.released = true
+				_, err := sa.updateAskRepeatInternal(request, -1)
+				if err != nil {
+					log.Logger().Warn("ask repeat update failed unexpectedly",
+						zap.Error(err))
+				}
+				return alloc
+			}
+			iterator.Reset()
+		}
+	}
+	// we checked all placeholders and asks nothing worked as yet
+	// pick the first fit and try all nodes if that fails give up
+	if phFit != nil && reqFit != nil {

Review comment:
       We cannot reuse that logic as it does not allow us to link back to the replacement. We also need to be careful with the release if no allocation is made: the queue usage would drop if we release without allocating, and the resource could be used by another application in the queue. That would break the gang as it might not be possible to get that space back.
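   For illustration only, the ordering this warns against, written in the shape of the surrounding package; removeAllocation is a hypothetical stand-in for whatever release path would be used and is not the PR's API:

```go
// Anti-pattern: releasing the placeholder before the real allocation is made.
// The moment the placeholder is released the queue usage drops, so another
// application in the queue can allocate into that space before the real pod
// lands, and the gang may never get the resource back. The result of the
// normal cycle also carries no Releases back-link to the placeholder.
func releaseThenAllocate(sa *Application, ph *Allocation, headRoom *resources.Resource,
	nodeIterator func() interfaces.NodeIterator) *Allocation {
	sa.removeAllocation(ph) // hypothetical release: queue usage drops here
	// window in which other queue applications can take the freed space
	return sa.tryAllocate(headRoom, nodeIterator)
}
```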

##########
File path: pkg/scheduler/objects/application.go
##########
@@ -604,6 +630,121 @@ func (sa *Application) tryAllocate(headRoom *resources.Resource, nodeIterator fu
 	return nil
 }
 
+// Try to replace a placeholder with a real allocation
+func (sa *Application) tryPlaceholderAllocate(nodeIterator func() interfaces.NodeIterator) *Allocation {
+	// cannot allocate if the iterator is not giving us any nodes
+	iterator := nodeIterator()
+	if iterator == nil {
+		return nil
+	}
+	sa.Lock()
+	defer sa.Unlock()
+	// nothing to do if we have no placeholders allocated
+	if resources.IsZero(sa.allocatedPlaceholder) {
+		return nil
+	}
+	// make sure the request are sorted
+	sa.sortRequests(false)
+	// keep the first fits for later
+	var phFit *Allocation
+	var reqFit *AllocationAsk
+	// get all the requests from the app sorted in order
+	for _, request := range sa.sortedRequests {
+		// skip placeholders they follow standard allocation
+		// this should also be part of a task group just make sure it is
+		if request.placeholder || request.taskGroupName == "" {
+			continue
+		}
+		// walk over the placeholders, allow for processing all as we can have multiple task groups
+		phAllocs := sa.getPlaceholderAllocations()
+		for _, ph := range phAllocs {

Review comment:
       Yes we do: we do not know whether the placeholder we get is for this specific task group. Since each application can have multiple task groups, we need to find one that matches.
   There is no reason to assume that if one placeholder fails they all fail. The expected cause of a failure to allocate the real pod is a node predicate failing, which could be a temporary issue or specific to that one node. Assuming the same state for all nodes is wrong. If all placeholders have issues, something strange is going on.
   We also cannot run the normal scheduling cycle (see the next review comment).
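   A short sketch of the matching rule described here, assuming the package context of the diff (field names taken from the code above); it is illustrative, not the PR code:

```go
// A placeholder is only a swap candidate for an ask from the same task group,
// and placeholders already flagged as released are skipped; one failing
// candidate does not disqualify the others.
func matchingPlaceholders(placeholders []*Allocation, ask *AllocationAsk) []*Allocation {
	var candidates []*Allocation
	for _, ph := range placeholders {
		if ph.released || ph.taskGroupName != ask.taskGroupName {
			continue
		}
		candidates = append(candidates, ph)
	}
	return candidates
}
```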

##########
File path: pkg/scheduler/objects/application.go
##########
@@ -604,6 +630,121 @@ func (sa *Application) tryAllocate(headRoom *resources.Resource, nodeIterator fu
 	return nil
 }
 
+// Try to replace a placeholder with a real allocation
+func (sa *Application) tryPlaceholderAllocate(nodeIterator func() interfaces.NodeIterator) *Allocation {
+	// cannot allocate if the iterator is not giving us any nodes
+	iterator := nodeIterator()
+	if iterator == nil {
+		return nil
+	}
+	sa.Lock()
+	defer sa.Unlock()
+	// nothing to do if we have no placeholders allocated
+	if resources.IsZero(sa.allocatedPlaceholder) {
+		return nil
+	}
+	// make sure the request are sorted
+	sa.sortRequests(false)
+	// keep the first fits for later
+	var phFit *Allocation
+	var reqFit *AllocationAsk
+	// get all the requests from the app sorted in order
+	for _, request := range sa.sortedRequests {
+		// skip placeholders they follow standard allocation
+		// this should also be part of a task group just make sure it is
+		if request.placeholder || request.taskGroupName == "" {
+			continue
+		}
+		// walk over the placeholders, allow for processing all as we can have multiple task groups
+		phAllocs := sa.getPlaceholderAllocations()
+		for _, ph := range phAllocs {
+			// we could have already released this placeholder and are waiting for the shim to confirm
+			// and check that we have the correct task group before trying to swap
+			if ph.released || request.taskGroupName != ph.taskGroupName {
+				continue
+			}
+			if phFit == nil && reqFit == nil {
+				phFit = ph
+				reqFit = request
+			}
+			// this is inefficient but needs to do for now as we only have the ID not the handle on the node itself
+			var ok bool
+			var node *Node
+			for iterator.HasNext() {
+				node, ok = iterator.Next().(*Node)
+				if !ok {
+					log.Logger().Warn("Node iterator failed to return a node")
+					return nil
+				}
+				if node.NodeID == ph.NodeID {
+					break

Review comment:
       This issue resolves itself with the signature change.

##########
File path: pkg/scheduler/objects/application.go
##########
@@ -604,6 +630,121 @@ func (sa *Application) tryAllocate(headRoom *resources.Resource, nodeIterator fu
 	return nil
 }
 
+// Try to replace a placeholder with a real allocation
+func (sa *Application) tryPlaceholderAllocate(nodeIterator func() interfaces.NodeIterator) *Allocation {
+	// cannot allocate if the iterator is not giving us any nodes
+	iterator := nodeIterator()
+	if iterator == nil {
+		return nil
+	}
+	sa.Lock()
+	defer sa.Unlock()
+	// nothing to do if we have no placeholders allocated
+	if resources.IsZero(sa.allocatedPlaceholder) {
+		return nil
+	}
+	// make sure the request are sorted
+	sa.sortRequests(false)
+	// keep the first fits for later
+	var phFit *Allocation
+	var reqFit *AllocationAsk
+	// get all the requests from the app sorted in order
+	for _, request := range sa.sortedRequests {
+		// skip placeholders they follow standard allocation
+		// this should also be part of a task group just make sure it is
+		if request.placeholder || request.taskGroupName == "" {
+			continue
+		}
+		// walk over the placeholders, allow for processing all as we can have multiple task groups
+		phAllocs := sa.getPlaceholderAllocations()
+		for _, ph := range phAllocs {
+			// we could have already released this placeholder and are waiting for the shim to confirm
+			// and check that we have the correct task group before trying to swap
+			if ph.released || request.taskGroupName != ph.taskGroupName {
+				continue
+			}
+			if phFit == nil && reqFit == nil {
+				phFit = ph
+				reqFit = request
+			}
+			// this is inefficient but needs to do for now as we only have the ID not the handle on the node itself

Review comment:
       A new TODO will cause the lint check to fail and thus the build to be marked as failed. That is why I did not add it.
   The solution for this issue is rather simple and would require a change of the function signature to:
   ```
   TryPlaceholderAllocate(nodeIterator func() interfaces.NodeIterator, node func() *objects.Node)
   ```
   That will allow us to just retrieve the node directly and remove a lot of the iteration. I will push a patch for that.
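   As an illustration of how that removes the iteration, a sketch assuming the getter ends up taking the node ID (an assumption about the eventual patch, not the committed API), in the package context of the diff above:

```go
// Sketch only: with a node getter passed in, the placeholder's node is looked
// up directly instead of walking the iterator until a matching NodeID is found.
func (sa *Application) swapWithGetter(request *AllocationAsk,
	getNode func(nodeID string) *Node) *Allocation {
	for _, ph := range sa.getPlaceholderAllocations() {
		if ph.released || request.taskGroupName != ph.taskGroupName {
			continue
		}
		node := getNode(ph.NodeID) // direct lookup replaces the HasNext/Next loop
		if node == nil || !node.preReserveConditions(request.AllocationKey) {
			continue
		}
		alloc := NewAllocation(common.GetNewUUID(), node.NodeID, request)
		// double link as in the change above
		alloc.Releases = []*Allocation{ph}
		ph.Releases = []*Allocation{alloc}
		alloc.Result = Replaced
		ph.released = true
		return alloc
	}
	return nil
}
```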



