You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by ta...@apache.org on 2019/11/22 08:07:02 UTC

[hadoop] branch trunk updated: YARN-9838. Fix resource inconsistency for queues when moving app with reserved container to another queue. Contributed by jiulongzhu.

This is an automated email from the ASF dual-hosted git repository.

taoyang pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/trunk by this push:
     new 8afabe4  YARN-9838. Fix resource inconsistency for queues when moving app with reserved container to another queue. Contributed by jiulongzhu.
8afabe4 is described below

commit 8afabe41e4175551ae71217a45f7c4276f10b7da
Author: Tao Yang <ya...@alibaba-inc.com>
AuthorDate: Fri Nov 22 16:06:14 2019 +0800

    YARN-9838. Fix resource inconsistency for queues when moving app with reserved container to another queue. Contributed by jiulongzhu.
---
 .../scheduler/capacity/CapacityScheduler.java      |  5 ++
 .../scheduler/capacity/LeafQueue.java              |  2 +
 .../scheduler/capacity/TestCapacityScheduler.java  | 90 ++++++++++++++++++++++
 3 files changed, 97 insertions(+)

diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java
index 2121a1f..7611bb6 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java
@@ -2472,6 +2472,11 @@ public class CapacityScheduler extends
           // attach the Container to another queue
           dest.attachContainer(getClusterResource(), app, rmContainer);
         }
+        // Move all reserved containers
+        for (RMContainer rmContainer : app.getReservedContainers()) {
+          source.detachContainer(getClusterResource(), app, rmContainer);
+          dest.attachContainer(getClusterResource(), app, rmContainer);
+        }
         if (!app.isStopped()) {
           source.finishApplicationAttempt(app, sourceQueueName);
           // Submit to a new queue
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java
index b883a9a..1028a7d 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java
@@ -2020,6 +2020,7 @@ public class LeafQueue extends AbstractCSQueue {
       allocateResource(clusterResource, application, rmContainer.getContainer()
           .getResource(), node.getPartition(), rmContainer);
       LOG.info("movedContainer" + " container=" + rmContainer.getContainer()
+          + " containerState="+ rmContainer.getState()
           + " resource=" + rmContainer.getContainer().getResource()
           + " queueMoveIn=" + this + " usedCapacity=" + getUsedCapacity()
           + " absoluteUsedCapacity=" + getAbsoluteUsedCapacity() + " used="
@@ -2039,6 +2040,7 @@ public class LeafQueue extends AbstractCSQueue {
       releaseResource(clusterResource, application, rmContainer.getContainer()
           .getResource(), node.getPartition(), rmContainer);
       LOG.info("movedContainer" + " container=" + rmContainer.getContainer()
+          + " containerState="+ rmContainer.getState()
           + " resource=" + rmContainer.getContainer().getResource()
           + " queueMoveOut=" + this + " usedCapacity=" + getUsedCapacity()
           + " absoluteUsedCapacity=" + getAbsoluteUsedCapacity() + " used="
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java
index 01be51c..e8bada8 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java
@@ -5540,4 +5540,94 @@ public class TestCapacityScheduler extends CapacitySchedulerTestBase {
     assertEquals(51200, ((CSQueueMetrics)cs.getQueue("a2").getMetrics()).getMaxCapacityMB());
     assertEquals(25600, ((CSQueueMetrics)cs.getQueue("a3").getMetrics()).getMaxCapacityMB());
   }
+
+  @Test
+  public void testReservedContainerLeakWhenMoveApplication() throws Exception {
+    CapacitySchedulerConfiguration csConf
+        = new CapacitySchedulerConfiguration();
+    csConf.setQueues(CapacitySchedulerConfiguration.ROOT,
+        new String[] {"a", "b"});
+    csConf.setCapacity("root.a", 50);
+    csConf.setMaximumCapacity("root.a", 100);
+    csConf.setUserLimitFactor("root.a", 100);
+    csConf.setCapacity("root.b", 50);
+    csConf.setMaximumCapacity("root.b", 100);
+    csConf.setUserLimitFactor("root.b", 100);
+
+    YarnConfiguration conf=new YarnConfiguration(csConf);
+    conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class,
+        ResourceScheduler.class);
+    RMNodeLabelsManager mgr=new NullRMNodeLabelsManager();
+    mgr.init(conf);
+    MockRM rm1 = new MockRM(csConf);
+    CapacityScheduler scheduler=(CapacityScheduler) rm1.getResourceScheduler();
+    rm1.getRMContext().setNodeLabelManager(mgr);
+    rm1.start();
+    MockNM nm1 = rm1.registerNode("127.0.0.1:1234", 8 * GB);
+    MockNM nm2 = rm1.registerNode("127.0.0.2:1234", 8 * GB);
+    /*
+     * simulation
+     * app1: (1 AM,1 running container)
+     * app2: (1 AM,1 reserved container)
+     */
+    // launch an app to queue, AM container should be launched in nm1
+    RMApp app1 = rm1.submitApp(1 * GB, "app_1", "user_1", null, "a");
+    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+
+    // launch another app to queue, AM container should be launched in nm1
+    RMApp app2 = rm1.submitApp(1 * GB, "app_2", "user_1", null, "a");
+    MockAM am2 = MockRM.launchAndRegisterAM(app2, rm1, nm1);
+
+    am1.allocate("*", 4 * GB, 1, new ArrayList<ContainerId>());
+    // this containerRequest should be reserved
+    am2.allocate("*", 4 * GB, 1, new ArrayList<ContainerId>());
+
+    RMNode rmNode1 = rm1.getRMContext().getRMNodes().get(nm1.getNodeId());
+    // Do node heartbeats 2 times
+    // First time will allocate container for app1, second time will reserve
+    // container for app2
+    scheduler.handle(new NodeUpdateSchedulerEvent(rmNode1));
+    scheduler.handle(new NodeUpdateSchedulerEvent(rmNode1));
+
+    FiCaSchedulerApp schedulerApp1 =
+        scheduler.getApplicationAttempt(am1.getApplicationAttemptId());
+    FiCaSchedulerApp schedulerApp2 =
+        scheduler.getApplicationAttempt(am2.getApplicationAttemptId());
+    // APP1:  1 AM, 1 allocatedContainer
+    Assert.assertEquals(2, schedulerApp1.getLiveContainers().size());
+    // APP2:  1 AM,1 reservedContainer
+    Assert.assertEquals(1, schedulerApp2.getLiveContainers().size());
+    Assert.assertEquals(1, schedulerApp2.getReservedContainers().size());
+    //before,move app2 which has one reservedContainer
+    LeafQueue srcQueue = (LeafQueue) scheduler.getQueue("a");
+    LeafQueue desQueue = (LeafQueue) scheduler.getQueue("b");
+    Assert.assertEquals(4, srcQueue.getNumContainers());
+    Assert.assertEquals(10*GB, srcQueue.getUsedResources().getMemorySize());
+    Assert.assertEquals(0, desQueue.getNumContainers());
+    Assert.assertEquals(0, desQueue.getUsedResources().getMemorySize());
+    //app1 ResourceUsage (0 reserved)
+    Assert.assertEquals(5*GB,
+        schedulerApp1
+            .getAppAttemptResourceUsage().getAllUsed().getMemorySize());
+    Assert.assertEquals(0,
+        schedulerApp1.getCurrentReservation().getMemorySize());
+    //app2  ResourceUsage (4GB reserved)
+    Assert.assertEquals(1*GB,
+        schedulerApp2
+            .getAppAttemptResourceUsage().getAllUsed().getMemorySize());
+    Assert.assertEquals(4*GB,
+        schedulerApp2.getCurrentReservation().getMemorySize());
+    //move app2 which has one reservedContainer
+    scheduler.moveApplication(app2.getApplicationId(), "b");
+    // keep this order
+    // if killing app1 first,the reservedContainer of app2 will be allocated
+    rm1.killApp(app2.getApplicationId());
+    rm1.killApp(app1.getApplicationId());
+    //after,moved app2 which has one reservedContainer
+    Assert.assertEquals(0, srcQueue.getNumContainers());
+    Assert.assertEquals(0, desQueue.getNumContainers());
+    Assert.assertEquals(0, srcQueue.getUsedResources().getMemorySize());
+    Assert.assertEquals(0, desQueue.getUsedResources().getMemorySize());
+    rm1.close();
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org