You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@yunikorn.apache.org by GitBox <gi...@apache.org> on 2022/10/06 14:58:24 UTC

[GitHub] [yunikorn-core] manirajv06 opened a new pull request, #441: [YUNIKORN-1328] Handle application state changes and trigger tracker interfaces

manirajv06 opened a new pull request, #441:
URL: https://github.com/apache/yunikorn-core/pull/441

   ### What is this PR for?
   
   Integrated ugm module with application state transition change blocks and other places.
   
   
   ### What type of PR is it?
   * [ ] - Feature
   
   ### Todos
   * [ ] - Task
   
   ### What is the Jira issue?
   https://issues.apache.org/jira/browse/YUNIKORN-1328
   
   ### How should this be tested?
   
   ### Screenshots (if appropriate)
   
   ### Questions:
   * [ ] - The licenses files need update.
   * [ ] - There is breaking changes for older versions.
   * [ ] - It needs documentation.
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@yunikorn.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [yunikorn-core] wilfred-s commented on a diff in pull request #441: [YUNIKORN-1328] Handle application state changes and trigger tracker interfaces

Posted by GitBox <gi...@apache.org>.
wilfred-s commented on code in PR #441:
URL: https://github.com/apache/yunikorn-core/pull/441#discussion_r1015002409


##########
pkg/scheduler/objects/application.go:
##########
@@ -1507,7 +1532,14 @@ func (sa *Application) removeAllocationInternal(uuid string) *Allocation {
 	var eventWarning string
 	// update correct allocation tracker
 	if alloc.IsPlaceholder() {
+		// as and when every ph gets removed (for replacement), resource usage would be reduced.
+		// When real allocation happens as part of replacement, usage would be increased again with real alloc resource
+		removeApp := false
 		sa.allocatedPlaceholder = resources.Sub(sa.allocatedPlaceholder, alloc.GetAllocatedResource())
+		if resources.IsZero(sa.allocatedPlaceholder) {
+			removeApp = true

Review Comment:
   Make sure this is correct: if we call `removeAllocationInternal()` when we time out the placeholders that have not been allocated yet we _might_ remove the app from tracking which is incorrect.
   We need to account for the same kind of cases as below with Fail and Run event.



##########
pkg/scheduler/objects/application.go:
##########
@@ -1507,7 +1532,14 @@ func (sa *Application) removeAllocationInternal(uuid string) *Allocation {
 	var eventWarning string
 	// update correct allocation tracker
 	if alloc.IsPlaceholder() {
+		// as and when every ph gets removed (for replacement), resource usage would be reduced.
+		// When real allocation happens as part of replacement, usage would be increased again with real alloc resource
+		removeApp := false
 		sa.allocatedPlaceholder = resources.Sub(sa.allocatedPlaceholder, alloc.GetAllocatedResource())
+		if resources.IsZero(sa.allocatedPlaceholder) {

Review Comment:
   Same check is happening below, please make merge the change into one check



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@yunikorn.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [yunikorn-core] pbacsko commented on a diff in pull request #441: [YUNIKORN-1328] Handle application state changes and trigger tracker interfaces

Posted by GitBox <gi...@apache.org>.
pbacsko commented on code in PR #441:
URL: https://github.com/apache/yunikorn-core/pull/441#discussion_r1006969154


##########
pkg/scheduler/ugm/manager.go:
##########
@@ -0,0 +1,288 @@
+/*
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+package ugm
+
+import (
+	"fmt"
+	"sync"
+
+	"go.uber.org/zap"
+
+	"github.com/apache/yunikorn-core/pkg/common/resources"
+	"github.com/apache/yunikorn-core/pkg/common/security"
+	"github.com/apache/yunikorn-core/pkg/log"
+)
+
+var once sync.Once
+var m *Manager
+
+// Manager implements tracker. A User Group Manager to track the usage for both user and groups.
+// Holds object of both user and group trackers
+type Manager struct {
+	userTrackers  map[string]*UserTracker
+	groupTrackers map[string]*GroupTracker
+	lock          sync.RWMutex
+}
+
+func newManager() *Manager {
+	manager := &Manager{
+		userTrackers:  make(map[string]*UserTracker),
+		groupTrackers: make(map[string]*GroupTracker),
+		lock:          sync.RWMutex{},
+	}
+	return manager
+}
+
+func GetUserManager() *Manager {
+	once.Do(func() {
+		m = newManager()
+	})
+	return m
+}
+
+// IncreaseTrackedResource Increase the resource usage for the given user group and queue path combination.
+// As and when every allocation or asks requests fulfilled on application, corresponding user and group
+// resource usage would be increased against specific application.
+func (m *Manager) IncreaseTrackedResource(queuePath string, applicationID string, usage *resources.Resource, user security.UserGroup) error {
+	log.Logger().Debug("Increasing resource usage", zap.String("user", user.User),
+		zap.String("queue path", queuePath),
+		zap.String("application", applicationID),
+		zap.String("resource", usage.String()))
+	if queuePath == "" || applicationID == "" || usage == nil || user.User == "" {
+		log.Logger().Error("Mandatory parameters are missing to increase the resource usage",
+			zap.String("user", user.User),
+			zap.String("queue path", queuePath),
+			zap.String("application", applicationID),
+			zap.String("resource", usage.String()))
+		return fmt.Errorf("mandatory parameters are missing. queuepath: %s, application id: %s, resource usage: %s, user: %s",
+			queuePath, applicationID, usage.String(), user.User)
+	}
+	m.lock.Lock()
+	defer m.lock.Unlock()
+	var userTracker *UserTracker
+	if m.userTrackers[user.User] == nil {
+		userTracker = newUserTracker(user)
+		m.userTrackers[user.User] = userTracker
+	} else {
+		userTracker = m.userTrackers[user.User]
+	}
+	err := userTracker.increaseTrackedResource(queuePath, applicationID, usage)
+	if err != nil {
+		log.Logger().Error("Problem in increasing the user resource usage",
+			zap.String("user", user.User),
+			zap.String("queue path", queuePath),
+			zap.String("application", applicationID),
+			zap.String("resource", usage.String()),
+			zap.String("err message", err.Error()))
+		return err
+	}
+	if err = m.ensureGroupTrackerForApp(queuePath, applicationID, user); err != nil {
+		return err
+	}
+	group, err := m.getGroup(user)
+	if err != nil {
+		return err
+	}
+	groupTracker := m.groupTrackers[group]
+	if groupTracker != nil {
+		err = groupTracker.increaseTrackedResource(queuePath, applicationID, usage)
+		if err != nil {
+			log.Logger().Error("Problem in increasing the group resource usage",
+				zap.String("user", user.User),
+				zap.String("group", group),
+				zap.String("queue path", queuePath),
+				zap.String("application", applicationID),
+				zap.String("resource", usage.String()),
+				zap.String("err message", err.Error()))
+			return err
+		}
+	}
+	return nil
+}
+
+// DecreaseTrackedResource Decrease the resource usage for the given user group and queue path combination.
+// As and when every allocation or asks release happens, corresponding user and group
+// resource usage would be decreased against specific application. When the final asks release happens, removeApp should be set to true and
+// application itself would be removed from the tracker and no more usage would be tracked further for that specific application.
+func (m *Manager) DecreaseTrackedResource(queuePath string, applicationID string, usage *resources.Resource, user security.UserGroup, removeApp bool) error {
+	log.Logger().Debug("Decreasing resource usage", zap.String("user", user.User),
+		zap.String("queue path", queuePath),
+		zap.String("application", applicationID),
+		zap.String("resource", usage.String()),
+		zap.Bool("removeApp", removeApp))
+	if queuePath == "" || applicationID == "" || usage == nil || user.User == "" {
+		log.Logger().Error("Mandatory parameters are missing to decrease the resource usage",
+			zap.String("user", user.User),
+			zap.String("queue path", queuePath),
+			zap.String("application", applicationID),
+			zap.String("resource", usage.String()),
+			zap.Bool("removeApp", removeApp))
+		return fmt.Errorf("mandatory parameters are missing. queuepath: %s, application id: %s, resource usage: %s, user: %s",
+			queuePath, applicationID, usage.String(), user.User)
+	}
+	m.lock.Lock()
+	defer m.lock.Unlock()
+	userTracker := m.userTrackers[user.User]
+	if userTracker != nil {
+		err := userTracker.decreaseTrackedResource(queuePath, applicationID, usage, removeApp)
+		if err != nil {
+			log.Logger().Error("Problem in decreasing the user resource usage",
+				zap.String("user", user.User),
+				zap.String("queue path", queuePath),
+				zap.String("application", applicationID),
+				zap.String("resource", usage.String()),
+				zap.Bool("removeApp", removeApp),
+				zap.String("err message", err.Error()))
+			return err
+		}
+		if removeApp {
+			if m.isUserRemovable(userTracker) {
+				delete(m.userTrackers, user.User)
+			}
+		}
+	} else {
+		log.Logger().Error("user tracker must be available in userTrackers map",
+			zap.String("user", user.User))
+		return fmt.Errorf("user tracker for %s is missing in userTrackers map", user.User)
+	}
+
+	group, err := m.getGroup(user)
+	if err != nil {
+		return err
+	}
+	groupTracker := m.groupTrackers[group]
+	if groupTracker != nil {
+		err := groupTracker.decreaseTrackedResource(queuePath, applicationID, usage, removeApp)
+		if err != nil {
+			log.Logger().Error("Problem in decreasing the group resource usage",
+				zap.String("user", user.User),
+				zap.String("group", group),
+				zap.String("queue path", queuePath),
+				zap.String("application", applicationID),
+				zap.String("resource", usage.String()),
+				zap.Bool("removeApp", removeApp),
+				zap.String("err message", err.Error()))
+			return err
+		}
+		if removeApp {
+			if m.isGroupRemovable(groupTracker) {
+				delete(m.groupTrackers, group)
+			}
+		}
+	} else {
+		log.Logger().Error("appGroupTrackers tracker must be available in groupTrackers map",
+			zap.String("appGroupTrackers", group))
+		return fmt.Errorf("appGroupTrackers tracker for %s is missing in groupTrackers map", group)
+	}
+	return nil
+}
+
+func (m *Manager) GetUserResources(user security.UserGroup) *resources.Resource {
+	return nil
+}
+
+func (m *Manager) GetGroupResources(group string) *resources.Resource {
+	return nil
+}
+
+func (m *Manager) GetUsersResources() []*UserTracker {
+	return nil
+}
+
+func (m *Manager) GetGroupsResources() []*GroupTracker {
+	return nil
+}
+
+func (m *Manager) ensureGroupTrackerForApp(queuePath string, applicationID string, user security.UserGroup) error {
+	userTracker := m.userTrackers[user.User]
+	if !userTracker.hasGroupForApp(applicationID) {
+		var groupTracker *GroupTracker
+		group, err := m.getGroup(user)
+		if err != nil {
+			return err
+		}
+		if m.groupTrackers[group] == nil {
+			log.Logger().Debug("Group tracker does not exist. Creating group tracker object and linking the same with application",
+				zap.String("application", applicationID),
+				zap.String("queue path", queuePath),
+				zap.String("user", user.User),
+				zap.String("group", group))
+			groupTracker = newGroupTracker(group)
+			m.groupTrackers[group] = groupTracker
+		} else {
+			log.Logger().Debug("Group tracker already exists and linking (reusing) the same with application",
+				zap.String("application", applicationID),
+				zap.String("queue path", queuePath),
+				zap.String("user", user.User),
+				zap.String("group", group))
+			groupTracker = m.groupTrackers[group]
+		}
+		userTracker.setGroupForApp(applicationID, groupTracker)
+	}
+	return nil
+}
+
+// getGroup Based on the current limitations, username and group name is same. Groups[0] is always set and same as username.
+// It would be changed in future based on user group resolution, limit configuration processing etc
+func (m *Manager) getGroup(user security.UserGroup) (string, error) {

Review Comment:
   Question: what if, for whatever reason, we don't have groups for a certain user? Can that happen? Eg. we disable the user-group retrieval in the AC, what happens here?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@yunikorn.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [yunikorn-core] manirajv06 commented on pull request #441: [YUNIKORN-1328] Handle application state changes and trigger tracker interfaces

Posted by GitBox <gi...@apache.org>.
manirajv06 commented on PR #441:
URL: https://github.com/apache/yunikorn-core/pull/441#issuecomment-1331671104

   @wilfred-s and I had one more round of review mostly around simplifying test related helper methods, clean up etc. Incorporated the changes.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@yunikorn.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [yunikorn-core] pbacsko commented on a diff in pull request #441: [YUNIKORN-1328] Handle application state changes and trigger tracker interfaces

Posted by GitBox <gi...@apache.org>.
pbacsko commented on code in PR #441:
URL: https://github.com/apache/yunikorn-core/pull/441#discussion_r1006969154


##########
pkg/scheduler/ugm/manager.go:
##########
@@ -0,0 +1,288 @@
+/*
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+package ugm
+
+import (
+	"fmt"
+	"sync"
+
+	"go.uber.org/zap"
+
+	"github.com/apache/yunikorn-core/pkg/common/resources"
+	"github.com/apache/yunikorn-core/pkg/common/security"
+	"github.com/apache/yunikorn-core/pkg/log"
+)
+
+var once sync.Once
+var m *Manager
+
+// Manager implements tracker. A User Group Manager to track the usage for both user and groups.
+// Holds object of both user and group trackers
+type Manager struct {
+	userTrackers  map[string]*UserTracker
+	groupTrackers map[string]*GroupTracker
+	lock          sync.RWMutex
+}
+
+func newManager() *Manager {
+	manager := &Manager{
+		userTrackers:  make(map[string]*UserTracker),
+		groupTrackers: make(map[string]*GroupTracker),
+		lock:          sync.RWMutex{},
+	}
+	return manager
+}
+
+func GetUserManager() *Manager {
+	once.Do(func() {
+		m = newManager()
+	})
+	return m
+}
+
+// IncreaseTrackedResource Increase the resource usage for the given user group and queue path combination.
+// As and when every allocation or asks requests fulfilled on application, corresponding user and group
+// resource usage would be increased against specific application.
+func (m *Manager) IncreaseTrackedResource(queuePath string, applicationID string, usage *resources.Resource, user security.UserGroup) error {
+	log.Logger().Debug("Increasing resource usage", zap.String("user", user.User),
+		zap.String("queue path", queuePath),
+		zap.String("application", applicationID),
+		zap.String("resource", usage.String()))
+	if queuePath == "" || applicationID == "" || usage == nil || user.User == "" {
+		log.Logger().Error("Mandatory parameters are missing to increase the resource usage",
+			zap.String("user", user.User),
+			zap.String("queue path", queuePath),
+			zap.String("application", applicationID),
+			zap.String("resource", usage.String()))
+		return fmt.Errorf("mandatory parameters are missing. queuepath: %s, application id: %s, resource usage: %s, user: %s",
+			queuePath, applicationID, usage.String(), user.User)
+	}
+	m.lock.Lock()
+	defer m.lock.Unlock()
+	var userTracker *UserTracker
+	if m.userTrackers[user.User] == nil {
+		userTracker = newUserTracker(user)
+		m.userTrackers[user.User] = userTracker
+	} else {
+		userTracker = m.userTrackers[user.User]
+	}
+	err := userTracker.increaseTrackedResource(queuePath, applicationID, usage)
+	if err != nil {
+		log.Logger().Error("Problem in increasing the user resource usage",
+			zap.String("user", user.User),
+			zap.String("queue path", queuePath),
+			zap.String("application", applicationID),
+			zap.String("resource", usage.String()),
+			zap.String("err message", err.Error()))
+		return err
+	}
+	if err = m.ensureGroupTrackerForApp(queuePath, applicationID, user); err != nil {
+		return err
+	}
+	group, err := m.getGroup(user)
+	if err != nil {
+		return err
+	}
+	groupTracker := m.groupTrackers[group]
+	if groupTracker != nil {
+		err = groupTracker.increaseTrackedResource(queuePath, applicationID, usage)
+		if err != nil {
+			log.Logger().Error("Problem in increasing the group resource usage",
+				zap.String("user", user.User),
+				zap.String("group", group),
+				zap.String("queue path", queuePath),
+				zap.String("application", applicationID),
+				zap.String("resource", usage.String()),
+				zap.String("err message", err.Error()))
+			return err
+		}
+	}
+	return nil
+}
+
+// DecreaseTrackedResource Decrease the resource usage for the given user group and queue path combination.
+// As and when every allocation or asks release happens, corresponding user and group
+// resource usage would be decreased against specific application. When the final asks release happens, removeApp should be set to true and
+// application itself would be removed from the tracker and no more usage would be tracked further for that specific application.
+func (m *Manager) DecreaseTrackedResource(queuePath string, applicationID string, usage *resources.Resource, user security.UserGroup, removeApp bool) error {
+	log.Logger().Debug("Decreasing resource usage", zap.String("user", user.User),
+		zap.String("queue path", queuePath),
+		zap.String("application", applicationID),
+		zap.String("resource", usage.String()),
+		zap.Bool("removeApp", removeApp))
+	if queuePath == "" || applicationID == "" || usage == nil || user.User == "" {
+		log.Logger().Error("Mandatory parameters are missing to decrease the resource usage",
+			zap.String("user", user.User),
+			zap.String("queue path", queuePath),
+			zap.String("application", applicationID),
+			zap.String("resource", usage.String()),
+			zap.Bool("removeApp", removeApp))
+		return fmt.Errorf("mandatory parameters are missing. queuepath: %s, application id: %s, resource usage: %s, user: %s",
+			queuePath, applicationID, usage.String(), user.User)
+	}
+	m.lock.Lock()
+	defer m.lock.Unlock()
+	userTracker := m.userTrackers[user.User]
+	if userTracker != nil {
+		err := userTracker.decreaseTrackedResource(queuePath, applicationID, usage, removeApp)
+		if err != nil {
+			log.Logger().Error("Problem in decreasing the user resource usage",
+				zap.String("user", user.User),
+				zap.String("queue path", queuePath),
+				zap.String("application", applicationID),
+				zap.String("resource", usage.String()),
+				zap.Bool("removeApp", removeApp),
+				zap.String("err message", err.Error()))
+			return err
+		}
+		if removeApp {
+			if m.isUserRemovable(userTracker) {
+				delete(m.userTrackers, user.User)
+			}
+		}
+	} else {
+		log.Logger().Error("user tracker must be available in userTrackers map",
+			zap.String("user", user.User))
+		return fmt.Errorf("user tracker for %s is missing in userTrackers map", user.User)
+	}
+
+	group, err := m.getGroup(user)
+	if err != nil {
+		return err
+	}
+	groupTracker := m.groupTrackers[group]
+	if groupTracker != nil {
+		err := groupTracker.decreaseTrackedResource(queuePath, applicationID, usage, removeApp)
+		if err != nil {
+			log.Logger().Error("Problem in decreasing the group resource usage",
+				zap.String("user", user.User),
+				zap.String("group", group),
+				zap.String("queue path", queuePath),
+				zap.String("application", applicationID),
+				zap.String("resource", usage.String()),
+				zap.Bool("removeApp", removeApp),
+				zap.String("err message", err.Error()))
+			return err
+		}
+		if removeApp {
+			if m.isGroupRemovable(groupTracker) {
+				delete(m.groupTrackers, group)
+			}
+		}
+	} else {
+		log.Logger().Error("appGroupTrackers tracker must be available in groupTrackers map",
+			zap.String("appGroupTrackers", group))
+		return fmt.Errorf("appGroupTrackers tracker for %s is missing in groupTrackers map", group)
+	}
+	return nil
+}
+
+func (m *Manager) GetUserResources(user security.UserGroup) *resources.Resource {
+	return nil
+}
+
+func (m *Manager) GetGroupResources(group string) *resources.Resource {
+	return nil
+}
+
+func (m *Manager) GetUsersResources() []*UserTracker {
+	return nil
+}
+
+func (m *Manager) GetGroupsResources() []*GroupTracker {
+	return nil
+}
+
+func (m *Manager) ensureGroupTrackerForApp(queuePath string, applicationID string, user security.UserGroup) error {
+	userTracker := m.userTrackers[user.User]
+	if !userTracker.hasGroupForApp(applicationID) {
+		var groupTracker *GroupTracker
+		group, err := m.getGroup(user)
+		if err != nil {
+			return err
+		}
+		if m.groupTrackers[group] == nil {
+			log.Logger().Debug("Group tracker does not exist. Creating group tracker object and linking the same with application",
+				zap.String("application", applicationID),
+				zap.String("queue path", queuePath),
+				zap.String("user", user.User),
+				zap.String("group", group))
+			groupTracker = newGroupTracker(group)
+			m.groupTrackers[group] = groupTracker
+		} else {
+			log.Logger().Debug("Group tracker already exists and linking (reusing) the same with application",
+				zap.String("application", applicationID),
+				zap.String("queue path", queuePath),
+				zap.String("user", user.User),
+				zap.String("group", group))
+			groupTracker = m.groupTrackers[group]
+		}
+		userTracker.setGroupForApp(applicationID, groupTracker)
+	}
+	return nil
+}
+
+// getGroup Based on the current limitations, username and group name is same. Groups[0] is always set and same as username.
+// It would be changed in future based on user group resolution, limit configuration processing etc
+func (m *Manager) getGroup(user security.UserGroup) (string, error) {

Review Comment:
   Question: what if, for whatever reason, we don't have groups for a certain user? Can that happen? Eg. we disable the user-group retrieval in the AC, what happens here? Should we really return an error if there are no groups?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@yunikorn.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [yunikorn-core] wilfred-s closed pull request #441: [YUNIKORN-1328] Handle application state changes and trigger tracker interfaces

Posted by GitBox <gi...@apache.org>.
wilfred-s closed pull request #441: [YUNIKORN-1328] Handle application state changes and trigger tracker interfaces
URL: https://github.com/apache/yunikorn-core/pull/441


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@yunikorn.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [yunikorn-core] wilfred-s commented on pull request #441: [YUNIKORN-1328] Handle application state changes and trigger tracker interfaces

Posted by GitBox <gi...@apache.org>.
wilfred-s commented on PR #441:
URL: https://github.com/apache/yunikorn-core/pull/441#issuecomment-1312854843

   We seem to have a deadlock in the code. One of the tests times out which normally means we have locked ourselves out.
   Can you please check?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@yunikorn.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [yunikorn-core] manirajv06 commented on pull request #441: [YUNIKORN-1328] Handle application state changes and trigger tracker interfaces

Posted by GitBox <gi...@apache.org>.
manirajv06 commented on PR #441:
URL: https://github.com/apache/yunikorn-core/pull/441#issuecomment-1282522353

   **Notes:**
   
   Why increase resource usage in app#AddAllocationInternal instead of enter_starting app transition code block?
   
   1. In case of a gang app, app state would be changed to "starting" only after all ph's have been allocated. But we need to update the user's resource usage during this ph's allocation stage itself.
   2. In case of a non - gang app, after 1st allocation (starting->running), the app would be in running state for future allocations (from 2nd allocation onwards) too. So these future allocations need to be accounted for in the user's resource usage.
   
   Hence, app#AddAllocationInternal is the right place to increase the resource usage. However, With this change, queue metrics and scheduler metrics to increase app running would be moved from enter_running app transition code block to enter_starting app transition code block.
   
   Similarly, app#removeAllocationInternal would be used to decrease the resource usage but not remove the app by calling the decrease method with removeApp as false.
   
   Finally, when the app enters “completing” state, the app would be removed by calling the decrease method with removeApp as true. With this change, even queue metrics and scheduler metrics to do decrease operations would be taken care too.
   
   Even in case of "resuming" (gang app start to work as non gang app because all ph timeouts and gang scheduling fallback style is soft) state, app would be removed as anyway it would be added again when it start to function as non gang app). This activity happens in enter_resuming state transition code block. In case "hard" gang scheduling style, app would be moved to failed state when all ph's timeout happens, hence app would be removed by calling the decrease method with removeApp as true. This activity happens in enter_failing state transition code block. With this change, even queue metrics and scheduler metrics to do decrease operations would be taken care too.
   
   
   **Stack traces of app#AddAllocationInternal**
   
   Node Registration
   
   p#addAllocation -> app#AddAllocation -> app#AddAllocationInternal
   
   Alloc releases
   
   context#processAllocationReleases -> p#removeAllocation -> app#ReplaceAllocation -> app#AddAllocationInternal
   
   Node Removal
   
   p#removeNodeAllocations -> app#ReplaceAllocation -> app#AddAllocationInternal
   
   Regular scheduling core path
   
   app#tryAllocate or app#tryNodes or app#tryNodesNoReserve or app#tryReservedAllocate -> app#AddAllocationInternal
   
   
   **Stack traces of app#removeAllocationInternal**
   
   Normal Alloc releases
   
   context#processAllocationReleases -> p#removeAllocation -> app#RemoveAllocation -> app#removeAllocationInternal
   
   Normal Alloc removal during Node Removal 
   
   p#removeNodeAllocations -> app#RemoveAllocation -> app#removeAllocationInternal
   
   ph Alloc releases
   
   context#processAllocationReleases -> p#removeAllocation -> app#ReplaceAllocation -> 
   app#removeAllocationInternal
   
   Ph Alloc removal during Node Removal
   
   p#removeNodeAllocations -> app#ReplaceAllocation -> 
   app#removeAllocationInternal
   
   
   **User resource usage enforcement and update**
   
   User quota would be enforced in app#tryAllocate just before the queue headroom check to ensure user quota is not exceeded.
   
   if !headRoom.FitInMaxUndef(request.GetAllocatedResource()) {
   }
   
   After these above check, app#tryAllocate calls app#tryNode to Node pre allocate checks, adding allocation into node, incrementing queue allocated resource for resource updates etc and finally calls app#addAllocationInternal. As described above, app#addAllocationInternal does the actual user’s resource usage updates.
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@yunikorn.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [yunikorn-core] manirajv06 commented on pull request #441: [YUNIKORN-1328] Handle application state changes and trigger tracker interfaces

Posted by GitBox <gi...@apache.org>.
manirajv06 commented on PR #441:
URL: https://github.com/apache/yunikorn-core/pull/441#issuecomment-1323242002

   @craigcondit @wilfred-s Cleaned up unwanted old commits (included as part of dependent jira rebase).


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@yunikorn.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [yunikorn-core] codecov[bot] commented on pull request #441: [YUNIKORN-1328] Handle application state changes and trigger tracker interfaces

Posted by GitBox <gi...@apache.org>.
codecov[bot] commented on PR #441:
URL: https://github.com/apache/yunikorn-core/pull/441#issuecomment-1313463832

   # [Codecov](https://codecov.io/gh/apache/yunikorn-core/pull/441?src=pr&el=h1&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=The+Apache+Software+Foundation) Report
   > Merging [#441](https://codecov.io/gh/apache/yunikorn-core/pull/441?src=pr&el=desc&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=The+Apache+Software+Foundation) (875115e) into [master](https://codecov.io/gh/apache/yunikorn-core/commit/1b1a6084e07091e4ca399adeddf84d84d3f8fe82?el=desc&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=The+Apache+Software+Foundation) (1b1a608) will **decrease** coverage by `0.27%`.
   > The diff coverage is `48.23%`.
   
   ```diff
   @@            Coverage Diff             @@
   ##           master     #441      +/-   ##
   ==========================================
   - Coverage   72.65%   72.37%   -0.28%     
   ==========================================
     Files          69       67       -2     
     Lines        9946     9864      -82     
   ==========================================
   - Hits         7226     7139      -87     
   - Misses       2470     2482      +12     
   + Partials      250      243       -7     
   ```
   
   
   | [Impacted Files](https://codecov.io/gh/apache/yunikorn-core/pull/441?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=The+Apache+Software+Foundation) | Coverage Δ | |
   |---|---|---|
   | [pkg/scheduler/ugm/manager.go](https://codecov.io/gh/apache/yunikorn-core/pull/441/diff?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=The+Apache+Software+Foundation#diff-cGtnL3NjaGVkdWxlci91Z20vbWFuYWdlci5nbw==) | `52.40% <10.00%> (-2.65%)` | :arrow_down: |
   | [pkg/scheduler/objects/application.go](https://codecov.io/gh/apache/yunikorn-core/pull/441/diff?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=The+Apache+Software+Foundation#diff-cGtnL3NjaGVkdWxlci9vYmplY3RzL2FwcGxpY2F0aW9uLmdv) | `57.23% <48.27%> (-0.22%)` | :arrow_down: |
   | [pkg/scheduler/partition.go](https://codecov.io/gh/apache/yunikorn-core/pull/441/diff?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=The+Apache+Software+Foundation#diff-cGtnL3NjaGVkdWxlci9wYXJ0aXRpb24uZ28=) | `75.28% <52.17%> (+0.06%)` | :arrow_up: |
   | [pkg/scheduler/objects/application\_state.go](https://codecov.io/gh/apache/yunikorn-core/pull/441/diff?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=The+Apache+Software+Foundation#diff-cGtnL3NjaGVkdWxlci9vYmplY3RzL2FwcGxpY2F0aW9uX3N0YXRlLmdv) | `100.00% <100.00%> (ø)` | |
   | [pkg/common/configs/configs.go](https://codecov.io/gh/apache/yunikorn-core/pull/441/diff?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=The+Apache+Software+Foundation#diff-cGtnL2NvbW1vbi9jb25maWdzL2NvbmZpZ3MuZ28=) | `83.67% <0.00%> (-16.33%)` | :arrow_down: |
   | [pkg/scheduler/context.go](https://codecov.io/gh/apache/yunikorn-core/pull/441/diff?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=The+Apache+Software+Foundation#diff-cGtnL3NjaGVkdWxlci9jb250ZXh0Lmdv) | `29.97% <0.00%> (-1.05%)` | :arrow_down: |
   | [pkg/common/configs/configtestutils.go](https://codecov.io/gh/apache/yunikorn-core/pull/441/diff?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=The+Apache+Software+Foundation#diff-cGtnL2NvbW1vbi9jb25maWdzL2NvbmZpZ3Rlc3R1dGlscy5nbw==) | | |
   | [pkg/common/configs/configwatcher.go](https://codecov.io/gh/apache/yunikorn-core/pull/441/diff?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=The+Apache+Software+Foundation#diff-cGtnL2NvbW1vbi9jb25maWdzL2NvbmZpZ3dhdGNoZXIuZ28=) | | |
   | [pkg/webservice/handlers.go](https://codecov.io/gh/apache/yunikorn-core/pull/441/diff?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=The+Apache+Software+Foundation#diff-cGtnL3dlYnNlcnZpY2UvaGFuZGxlcnMuZ28=) | `76.92% <0.00%> (+0.37%)` | :arrow_up: |
   | ... and [2 more](https://codecov.io/gh/apache/yunikorn-core/pull/441/diff?src=pr&el=tree-more&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=The+Apache+Software+Foundation) | |
   
   :mega: We’re building smart automated test selection to slash your CI/CD build times. [Learn more](https://about.codecov.io/iterative-testing/?utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=The+Apache+Software+Foundation)
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@yunikorn.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [yunikorn-core] craigcondit commented on pull request #441: [YUNIKORN-1328] Handle application state changes and trigger tracker interfaces

Posted by GitBox <gi...@apache.org>.
craigcondit commented on PR #441:
URL: https://github.com/apache/yunikorn-core/pull/441#issuecomment-1319336657

   @manirajv06 this PR seems to have quite a bit of unnecessary bits from other JIRAs included. Can you clean this up and rebase / squash?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@yunikorn.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [yunikorn-core] wilfred-s commented on a diff in pull request #441: [YUNIKORN-1328] Handle application state changes and trigger tracker interfaces

Posted by GitBox <gi...@apache.org>.
wilfred-s commented on code in PR #441:
URL: https://github.com/apache/yunikorn-core/pull/441#discussion_r1030035681


##########
pkg/scheduler/objects/application.go:
##########
@@ -1442,12 +1446,33 @@ func (sa *Application) addAllocationInternal(info *Allocation) {
 					zap.Error(err))
 			}
 		}
+		sa.incUserResourceUsage(info.GetAllocatedResource())
 		sa.allocatedResource = resources.Add(sa.allocatedResource, info.GetAllocatedResource())
 		sa.maxAllocatedResource = resources.ComponentWiseMax(sa.allocatedResource, sa.maxAllocatedResource)
 	}
 	sa.allocations[info.GetUUID()] = info
 }
 
+func (sa *Application) incUserResourceUsage(resource *resources.Resource) {

Review Comment:
   Please add comment on the function and specify if we should hold the lock or not.



##########
pkg/scheduler/objects/application_test.go:
##########
@@ -33,10 +33,17 @@ import (
 	"github.com/apache/yunikorn-core/pkg/handler"
 	"github.com/apache/yunikorn-core/pkg/rmproxy"
 	"github.com/apache/yunikorn-core/pkg/rmproxy/rmevent"
+	"github.com/apache/yunikorn-core/pkg/scheduler/ugm"
 	siCommon "github.com/apache/yunikorn-scheduler-interface/lib/go/common"
 	"github.com/apache/yunikorn-scheduler-interface/lib/go/si"
 )
 
+func setupUGM() {
+	ugm := ugm.GetUserManager()

Review Comment:
   NIT: This variable clashes with an imported package, we should avoid that by choosing a different var name.



##########
pkg/scheduler/objects/application.go:
##########
@@ -1442,12 +1446,33 @@ func (sa *Application) addAllocationInternal(info *Allocation) {
 					zap.Error(err))
 			}
 		}
+		sa.incUserResourceUsage(info.GetAllocatedResource())
 		sa.allocatedResource = resources.Add(sa.allocatedResource, info.GetAllocatedResource())
 		sa.maxAllocatedResource = resources.ComponentWiseMax(sa.allocatedResource, sa.maxAllocatedResource)
 	}
 	sa.allocations[info.GetUUID()] = info
 }
 
+func (sa *Application) incUserResourceUsage(resource *resources.Resource) {
+	if err := ugm.GetUserManager().IncreaseTrackedResource(sa.queuePath, sa.ApplicationID, resource, sa.user); err != nil {
+		log.Logger().Error("Unable to track the user resource usage",
+			zap.String("application id", sa.ApplicationID),
+			zap.String("user", sa.GetUser().User),

Review Comment:
   Why the difference in accessing the user?
   One uses `sa.user` while the other uses an indirect call via `sa.GetUser().User`
   We hold the sa lock direct access should be OK



##########
pkg/scheduler/objects/utilities_test.go:
##########
@@ -94,7 +94,7 @@ func newApplication(appID, partition, queueName string) *Application {
 func newApplicationWithTags(appID, partition, queueName string, tags map[string]string) *Application {
 	user := security.UserGroup{
 		User:   "testuser",
-		Groups: []string{},
+		Groups: []string{"testgroup"},

Review Comment:
   We need to be able to handle an empty group list. The existing tests should still pass without this change.



##########
pkg/scheduler/partition_manager_test.go:
##########
@@ -78,7 +78,7 @@ func TestCleanQueues(t *testing.T) {
 func TestRemoveAll(t *testing.T) {
 	p := createPartitionContext(t)
 
-	_, err := p.createQueue("root.test", security.UserGroup{})
+	_, err := p.createQueue("root.test", security.UserGroup{User: "testuser", Groups: []string{"testgroup"}})

Review Comment:
   The existing tests and calls should not fail with an empty security object



##########
pkg/scheduler/ugm/manager.go:
##########
@@ -279,10 +285,18 @@ func (m *Manager) isGroupRemovable(gt *GroupTracker) bool {
 }
 
 // getUserTrackers only for tests
-func (m *Manager) getUserTrackers() map[string]*UserTracker {
+func (m *Manager) GetUserTrackers() map[string]*UserTracker {

Review Comment:
   Even in test this could cause race conditions. We need to be careful with this.



##########
pkg/scheduler/partition_test.go:
##########
@@ -251,9 +270,23 @@ func TestAddNodeWithAllocations(t *testing.T) {
 
 	// check the queue usage
 	assert.Assert(t, resources.Equals(q.GetAllocatedResource(), appRes), "add node to partition did not update queue as expected")
+	assertUserGroupResource(t, appRes)
+}
+
+func assertUserGroupNilResourceWithError(t *testing.T) {

Review Comment:
   Keep this with the other assertUserGroupResource()( function and the same comments for this one as for the one in objecs...



##########
pkg/scheduler/objects/application.go:
##########
@@ -1442,12 +1446,33 @@ func (sa *Application) addAllocationInternal(info *Allocation) {
 					zap.Error(err))
 			}
 		}
+		sa.incUserResourceUsage(info.GetAllocatedResource())
 		sa.allocatedResource = resources.Add(sa.allocatedResource, info.GetAllocatedResource())
 		sa.maxAllocatedResource = resources.ComponentWiseMax(sa.allocatedResource, sa.maxAllocatedResource)
 	}
 	sa.allocations[info.GetUUID()] = info
 }
 
+func (sa *Application) incUserResourceUsage(resource *resources.Resource) {
+	if err := ugm.GetUserManager().IncreaseTrackedResource(sa.queuePath, sa.ApplicationID, resource, sa.user); err != nil {
+		log.Logger().Error("Unable to track the user resource usage",
+			zap.String("application id", sa.ApplicationID),
+			zap.String("user", sa.GetUser().User),
+			zap.String("currentState", sa.stateMachine.Current()),
+			zap.Error(err))
+	}
+}
+
+func (sa *Application) decUserResourceUsage(resource *resources.Resource, removeApp bool) {
+	if err := ugm.GetUserManager().DecreaseTrackedResource(sa.queuePath, sa.ApplicationID, resource, sa.user, removeApp); err != nil {
+		log.Logger().Error("Unable to track the user resource usage",
+			zap.String("application id", sa.ApplicationID),
+			zap.String("user", sa.GetUser().User),

Review Comment:
   See earlier comment on difference in access



##########
pkg/scheduler/objects/application.go:
##########
@@ -1442,12 +1446,33 @@ func (sa *Application) addAllocationInternal(info *Allocation) {
 					zap.Error(err))
 			}
 		}
+		sa.incUserResourceUsage(info.GetAllocatedResource())
 		sa.allocatedResource = resources.Add(sa.allocatedResource, info.GetAllocatedResource())
 		sa.maxAllocatedResource = resources.ComponentWiseMax(sa.allocatedResource, sa.maxAllocatedResource)
 	}
 	sa.allocations[info.GetUUID()] = info
 }
 
+func (sa *Application) incUserResourceUsage(resource *resources.Resource) {
+	if err := ugm.GetUserManager().IncreaseTrackedResource(sa.queuePath, sa.ApplicationID, resource, sa.user); err != nil {
+		log.Logger().Error("Unable to track the user resource usage",
+			zap.String("application id", sa.ApplicationID),
+			zap.String("user", sa.GetUser().User),
+			zap.String("currentState", sa.stateMachine.Current()),
+			zap.Error(err))
+	}
+}
+
+func (sa *Application) decUserResourceUsage(resource *resources.Resource, removeApp bool) {

Review Comment:
   Please add comment on the function and specify if we should hold the lock or not.



##########
pkg/scheduler/objects/application_test.go:
##########
@@ -1382,6 +1441,52 @@ func TestAppTimersAfterAppRemoval(t *testing.T) {
 	}
 }
 
+func TestIncAndDecUserResourceUsage(t *testing.T) {
+	res := resources.NewResourceFromMap(map[string]resources.Quantity{"first": 5})
+	app := newApplication(appID1, "default", "root.unknown")
+	if app == nil || app.ApplicationID != appID1 {
+		t.Fatalf("app create failed which should not have %v", app)
+	}
+	queue, err := createRootQueue(nil)
+	assert.NilError(t, err, "queue create failed")
+	app.queue = queue
+	assertUserGroupNilResourceWithError(t)
+	app.incUserResourceUsage(nil)
+	assertUserGroupNilResourceWithError(t)
+	app.incUserResourceUsage(res)
+	assertUserGroupResource(t, res)
+	app.incUserResourceUsage(res)
+	assertUserGroupResource(t, resources.Multiply(res, 2))
+	app.decUserResourceUsage(nil, false)
+	assertUserGroupResource(t, resources.Multiply(res, 2))
+	app.decUserResourceUsage(res, false)
+	assertUserGroupResource(t, res)
+	app.decUserResourceUsage(res, false)
+	assertUserGroupResource(t, resources.NewResourceFromMap(map[string]resources.Quantity{"first": 0}))
+}
+
+func assertUserGroupNilResourceWithError(t *testing.T) {
+	ugm := ugm.GetUserManager()
+	userResource, err := ugm.GetUserResources(security.UserGroup{User: "testuser", Groups: []string{"testgroup"}})
+	assert.Error(t, err, "user testuser is not available in user trackers map")
+	groupResource, err := ugm.GetGroupResources("testgroup")

Review Comment:
   We should not hardcode these functions on the testuser and testgroup. Make them re-usable, testing with multiple users and groups should be able to use the same functions. Moving them to `utilities_test.go` also helps with that. 
   * allow passing in a `security.UserGroup` object
   * create a wrapper that creates a testuser/group `security.UserGroup` object and passes it in
   
   We need a follow jira for added testing: to make sure multiple users with the same group trigger the correct updates etc.



##########
pkg/scheduler/partition.go:
##########
@@ -395,30 +398,32 @@ func (pc *PartitionContext) removeApplication(appID string) []*objects.Allocatio
 		queue.RemoveApplication(app)
 	}
 	// Remove all allocations
-	allocations := app.RemoveAllAllocations()
-	// Remove all allocations from node(s) (queues have been updated already)
-	if len(allocations) != 0 {
-		// track the number of allocations
-		pc.updateAllocationCount(-len(allocations))
-		for _, alloc := range allocations {
-			currentUUID := alloc.GetUUID()
-			node := pc.GetNode(alloc.GetNodeID())
-			if node == nil {
-				log.Logger().Warn("unknown node: not found in active node list",
-					zap.String("appID", appID),
-					zap.String("nodeID", alloc.GetNodeID()))
-				continue
-			}
-			if nodeAlloc := node.RemoveAllocation(currentUUID); nodeAlloc == nil {
-				log.Logger().Warn("unknown allocation: not found on the node",
-					zap.String("appID", appID),
-					zap.String("allocationId", currentUUID),
-					zap.String("nodeID", alloc.GetNodeID()))
+	if len(app.GetAllAllocations()) > 0 {

Review Comment:
   This is a slow down of the code. It copies the allocations and immediately dumps the result. The caller of `removeApplication` handles  a zero length slice the same way as it does a nil. It also does not trigger the app state change.
   This change should be reverted



##########
pkg/scheduler/partition_test.go:
##########
@@ -36,10 +36,27 @@ import (
 	"github.com/apache/yunikorn-core/pkg/rmproxy/rmevent"
 	"github.com/apache/yunikorn-core/pkg/scheduler/objects"
 	"github.com/apache/yunikorn-core/pkg/scheduler/policies"
+	"github.com/apache/yunikorn-core/pkg/scheduler/ugm"
 	siCommon "github.com/apache/yunikorn-scheduler-interface/lib/go/common"
 	"github.com/apache/yunikorn-scheduler-interface/lib/go/si"
 )
 
+func setupUGM() {
+	ugm := ugm.GetUserManager()

Review Comment:
   NIT: This variable clashes with an imported package, we should avoid that by choosing a different var name.



##########
pkg/scheduler/partition.go:
##########
@@ -63,6 +64,7 @@ type PartitionContext struct {
 	totalPartitionResource *resources.Resource             // Total node resources
 	allocations            int                             // Number of allocations on the partition
 	stateDumpFilePath      string                          // Path of output file for state dumps
+	ugm                    *ugm.Manager                    // User group manager

Review Comment:
   NIT: This variable clashes with an imported package, we should avoid that by choosing a different var name.



##########
pkg/scheduler/ugm/manager.go:
##########
@@ -193,20 +193,26 @@ func (m *Manager) DecreaseTrackedResource(queuePath string, applicationID string
 	return nil
 }
 
-func (m *Manager) GetUserResources(user security.UserGroup) *resources.Resource {
-	return nil
+func (m *Manager) GetUserResources(user security.UserGroup) (*resources.Resource, error) {
+	if m.userTrackers[user.User] != nil {
+		return m.userTrackers[user.User].queueTracker.resourceUsage, nil
+	}
+	return nil, fmt.Errorf("user %s is not available in user trackers map", user.User)
 }
 
-func (m *Manager) GetGroupResources(group string) *resources.Resource {
-	return nil
+func (m *Manager) GetGroupResources(group string) (*resources.Resource, error) {
+	if m.groupTrackers[group] != nil {
+		return m.groupTrackers[group].queueTracker.resourceUsage, nil
+	}
+	return nil, fmt.Errorf("group %s is not available in group trackers map", group)
 }
 
-func (m *Manager) GetUsersResources() []*UserTracker {
-	return nil
+func (m *Manager) GetUsersResources() map[string]*UserTracker {
+	return m.userTrackers

Review Comment:
   This could cause race conditions. The caller gets that map and can manipulate the map.
   maps are not concurrency safe and need accessing under lock. We should pass back a copy of the content.
   That is why the `[]*userTracker` was chosen



##########
pkg/scheduler/ugm/manager.go:
##########
@@ -279,10 +285,18 @@ func (m *Manager) isGroupRemovable(gt *GroupTracker) bool {
 }
 
 // getUserTrackers only for tests
-func (m *Manager) getUserTrackers() map[string]*UserTracker {
+func (m *Manager) GetUserTrackers() map[string]*UserTracker {
 	return m.userTrackers
 }
 
-func (m *Manager) getGroupTrackers() map[string]*GroupTracker {
+func (m *Manager) GetGroupTrackers() map[string]*GroupTracker {
 	return m.groupTrackers
 }
+
+func (m *Manager) ClearUserTrackers() {
+	m.userTrackers = make(map[string]*UserTracker)
+}
+
+func (m *Manager) ClearGroupTrackers() {

Review Comment:
   race condition needs to happen under lock



##########
pkg/webservice/handlers_test.go:
##########
@@ -468,7 +468,7 @@ func TestGetClusterUtilJSON(t *testing.T) {
 	assert.Equal(t, partitionName, partition.Name)
 	// new app to partition
 	appID := "appID-1"
-	app := newApplication(appID, partitionName, queueName, rmID, security.UserGroup{})
+	app := newApplication(appID, partitionName, queueName, rmID, security.UserGroup{User: "testuser", Groups: []string{"testgroup"}})

Review Comment:
   `newApplication` is a wrapper defined in this file. A simpler change would be to check if and empty `security.UserGroup{}` is passed in in that implementation and then override with a test user.



##########
pkg/scheduler/ugm/tracker.go:
##########
@@ -28,8 +28,8 @@ type Tracker interface {
 	GetUserResources(user security.UserGroup) *resources.Resource
 	GetGroupResources(group string) *resources.Resource
 
-	GetUsersResources() []*UserTracker
-	GetGroupsResources() []*GroupTracker
+	GetUsersResources() map[string]*UserTracker
+	GetGroupsResources() map[string]*GroupTracker

Review Comment:
   This is a deviation of the design. Why do we need to have a map instead of a slice?



##########
pkg/scheduler/ugm/manager.go:
##########
@@ -193,20 +193,26 @@ func (m *Manager) DecreaseTrackedResource(queuePath string, applicationID string
 	return nil
 }
 
-func (m *Manager) GetUserResources(user security.UserGroup) *resources.Resource {
-	return nil
+func (m *Manager) GetUserResources(user security.UserGroup) (*resources.Resource, error) {
+	if m.userTrackers[user.User] != nil {

Review Comment:
   map are not concurrent safe access needs to be under lock



##########
pkg/scheduler/partition_test.go:
##########
@@ -2007,21 +2100,28 @@ func TestPlaceholderBiggerThanReal(t *testing.T) {
 	assert.Equal(t, 1, partition.GetTotalAllocationCount(), "real allocation should be registered on the partition")
 	assert.Assert(t, resources.Equals(smallRes, app.GetQueue().GetAllocatedResource()), "real size should be allocated on queue")
 	assert.Assert(t, resources.Equals(smallRes, node.GetAllocatedResource()), "real size should be allocated on node")
+	assertUserGroupResource(t, smallRes)
+}
+
+func assertUserGroupResource(t *testing.T, expected *resources.Resource) {
+	ugm := ugm.GetUserManager()
+	userResource, err := ugm.GetUserResources(security.UserGroup{User: "testuser", Groups: []string{"testgroup"}})
+	assert.NilError(t, err, "User should have been tracked by this time and available in user trackers map")
+	groupResource, err := ugm.GetGroupResources("testgroup")
+	assert.NilError(t, err, "Group should have been tracked by this time and available in group trackers map")
+	assert.Equal(t, userResource.String(), expected.String())
+	assert.Equal(t, groupResource.String(), expected.String())

Review Comment:
   Same as for the other instance of this check we need to make it user independent, and move it to utilities_test.go



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@yunikorn.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org