You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by sn...@apache.org on 2022/03/10 21:23:15 UTC

[hadoop] branch trunk updated: YARN-11067. Resource overcommitment due to incorrect resource normalisation logical order. Contributed by Andras Gyori

This is an automated email from the ASF dual-hosted git repository.

snemeth pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/trunk by this push:
     new ed65aa2  YARN-11067. Resource overcommitment due to incorrect resource normalisation logical order. Contributed by Andras Gyori
ed65aa2 is described below

commit ed65aa23240b3dd6b56e86e5f0e9d38069fb3b01
Author: Szilard Nemeth <sn...@apache.org>
AuthorDate: Thu Mar 10 22:22:58 2022 +0100

    YARN-11067. Resource overcommitment due to incorrect resource normalisation logical order. Contributed by Andras Gyori
---
 .../scheduler/capacity/ParentQueue.java            | 37 ++++++++---------
 .../TestAbsoluteResourceConfiguration.java         | 47 ++++++++++++++++++++++
 2 files changed, 64 insertions(+), 20 deletions(-)

diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java
index c624aab..87ebc0b 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java
@@ -1294,17 +1294,24 @@ public class ParentQueue extends AbstractCSQueue {
 
   private void calculateEffectiveResourcesAndCapacity(String label,
       Resource clusterResource) {
+    // Update effective resources for my self;
+    if (rootQueue) {
+      Resource resourceByLabel = labelManager.getResourceByLabel(label, clusterResource);
+      usageTracker.getQueueResourceQuotas().setEffectiveMinResource(label, resourceByLabel);
+      usageTracker.getQueueResourceQuotas().setEffectiveMaxResource(label, resourceByLabel);
+    } else {
+      super.updateEffectiveResources(clusterResource);
+    }
+
+    recalculateEffectiveMinRatio(label, clusterResource);
+  }
+
+  private void recalculateEffectiveMinRatio(String label, Resource clusterResource) {
     // For root queue, ensure that max/min resource is updated to latest
     // cluster resource.
-    Resource resourceByLabel = labelManager.getResourceByLabel(label,
-        clusterResource);
-
-    /*
-     * == Below logic are added to calculate effectiveMinRatioPerResource ==
-     */
+    Resource resourceByLabel = labelManager.getResourceByLabel(label, clusterResource);
 
-    // Total configured min resources of direct children of this given parent
-    // queue
+    // Total configured min resources of direct children of this given parent queue
     Resource configuredMinResources = Resource.newInstance(0L, 0);
     for (CSQueue childQueue : getChildQueues()) {
       Resources.addTo(configuredMinResources,
@@ -1312,8 +1319,7 @@ public class ParentQueue extends AbstractCSQueue {
     }
 
     // Factor to scale down effective resource: When cluster has sufficient
-    // resources, effective_min_resources will be same as configured
-    // min_resources.
+    // resources, effective_min_resources will be same as configured min_resources.
     Resource numeratorForMinRatio = null;
     if (getQueuePath().equals("root")) {
       if (!resourceByLabel.equals(Resources.none()) && Resources.lessThan(resourceCalculator,
@@ -1324,21 +1330,12 @@ public class ParentQueue extends AbstractCSQueue {
       if (Resources.lessThan(resourceCalculator, clusterResource,
           usageTracker.getQueueResourceQuotas().getEffectiveMinResource(label),
           configuredMinResources)) {
-        numeratorForMinRatio = usageTracker.getQueueResourceQuotas()
-            .getEffectiveMinResource(label);
+        numeratorForMinRatio = usageTracker.getQueueResourceQuotas().getEffectiveMinResource(label);
       }
     }
 
     effectiveMinResourceRatio.put(label, getEffectiveMinRatio(
         configuredMinResources, numeratorForMinRatio));
-
-    // Update effective resources for my self;
-    if (rootQueue) {
-      usageTracker.getQueueResourceQuotas().setEffectiveMinResource(label, resourceByLabel);
-      usageTracker.getQueueResourceQuotas().setEffectiveMaxResource(label, resourceByLabel);
-    } else{
-      super.updateEffectiveResources(clusterResource);
-    }
   }
 
   private Map<String, Float> getEffectiveMinRatio(
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestAbsoluteResourceConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestAbsoluteResourceConfiguration.java
index d7c80b5..8d68cbf 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestAbsoluteResourceConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestAbsoluteResourceConfiguration.java
@@ -30,6 +30,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.MockNM;
 import org.apache.hadoop.yarn.server.resourcemanager.MockRM;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeRemovedSchedulerEvent;
 import org.apache.hadoop.yarn.util.resource.Resources;
 import org.junit.Assert;
 import org.junit.Test;
@@ -100,6 +101,21 @@ public class TestAbsoluteResourceConfiguration {
   private static Set<String> resourceTypes = new HashSet<>(
       Arrays.asList("memory", "vcores"));
 
+  private CapacitySchedulerConfiguration setupNormalizationConfiguration() {
+    CapacitySchedulerConfiguration csConf = new CapacitySchedulerConfiguration();
+    csConf.setQueues(CapacitySchedulerConfiguration.ROOT,
+        new String[]{QUEUEA, QUEUEB});
+    csConf.setQueues(QUEUEA_FULL.getFullPath(), new String[]{QUEUEA1, QUEUEA2});
+
+    // Top-level minimums total 60 GB, 28 vcores (QUEUEA 50 GB/20 + QUEUEB 10 GB/8)
+    csConf.setMinimumResourceRequirement("", QUEUEA_FULL, Resource.newInstance(50 * GB, 20));
+    csConf.setMinimumResourceRequirement("", QUEUEA1_FULL, Resource.newInstance(30 * GB, 15));
+    csConf.setMinimumResourceRequirement("", QUEUEA2_FULL, Resource.newInstance(20 * GB, 5));
+    csConf.setMinimumResourceRequirement("", QUEUEB_FULL, Resource.newInstance(10 * GB, 8));
+
+    return csConf;
+  }
+
   private CapacitySchedulerConfiguration setupSimpleQueueConfiguration(
       boolean isCapacityNeeded) {
     CapacitySchedulerConfiguration csConf = new CapacitySchedulerConfiguration();
@@ -293,6 +309,37 @@ public class TestAbsoluteResourceConfiguration {
   }
 
   @Test
+  public void testNormalizationAfterNodeRemoval() throws Exception {
+    CapacitySchedulerConfiguration csConf = setupNormalizationConfiguration();
+    csConf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class,
+        ResourceScheduler.class);
+
+    MockRM rm = new MockRM(csConf);
+
+    rm.start();
+    rm.registerNode("h1:1234", 8 * GB, 4);
+    rm.registerNode("h2:1234", 8 * GB, 4);
+    rm.registerNode("h3:1234", 8 * GB, 4);
+    MockNM nm = rm.registerNode("h4:1234", 8 * GB, 4);
+    rm.registerNode("h5:1234", 28 * GB, 12);
+
+    // Send a removal event to CS. MockRM#unregisterNode does not reflect the real world scenario,
+    // therefore we manually need to invoke this removal event.
+    CapacityScheduler cs = (CapacityScheduler) rm.getResourceScheduler();
+    cs.handle(new NodeRemovedSchedulerEvent(rm.getRMContext().getRMNodes().get(nm.getNodeId())));
+
+    Resource res = Resources.add(
+        cs.getQueue(QUEUEA1_FULL.getFullPath()).getEffectiveCapacity(""),
+        cs.getQueue(QUEUEA2_FULL.getFullPath()).getEffectiveCapacity(""));
+    Resource resParent = cs.getQueue(QUEUEA_FULL.getFullPath()).getEffectiveCapacity("");
+
+    // Check if there is no overcommitment on behalf of the child queues
+    Assert.assertTrue(String.format("Summarized resource %s of all children is greater than " +
+        "their parent's %s", res, resParent),
+        Resources.lessThan(cs.getResourceCalculator(), cs.getClusterResource(), res, resParent));
+  }
+
+  @Test
   public void testEffectiveMinMaxResourceConfigurartionPerQueue()
       throws Exception {
     // create conf with basic queue configuration.

---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org