Posted to common-commits@hadoop.apache.org by jh...@apache.org on 2020/04/30 19:16:28 UTC

[hadoop] branch branch-2.10 updated: YARN-8193. YARN RM hangs abruptly (stops allocating resources) when running successive applications. (Zian Chen via wangda)

This is an automated email from the ASF dual-hosted git repository.

jhung pushed a commit to branch branch-2.10
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/branch-2.10 by this push:
     new 27ad054  YARN-8193. YARN RM hangs abruptly (stops allocating resources) when running successive applications. (Zian Chen via wangda)
27ad054 is described below

commit 27ad054696a93cc12d84cdcda88cd0cf6cd11151
Author: Jonathan Hung <jh...@linkedin.com>
AuthorDate: Thu Apr 30 12:16:15 2020 -0700

    YARN-8193. YARN RM hangs abruptly (stops allocating resources) when running successive applications. (Zian Chen via wangda)
---
 .../allocator/RegularContainerAllocator.java       | 42 ++++++++++++++++------
 1 file changed, 31 insertions(+), 11 deletions(-)
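
At a glance, the fix guards every use of the per-scheduler-key SchedulingPlacementSet: when the pending resource for a key is decreased by a different thread, appInfo.getSchedulingPlacementSet(schedulerKey) can return null, and the previous code dereferenced the result unconditionally, which is how the RM ends up abruptly ceasing to allocate as YARN-8193 reports. The first hunk below captures the lookup once and skips the priority when it is null. A stand-alone sketch of that pattern follows; the map and class names are illustrative only, not the actual RegularContainerAllocator code.

    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;

    // Stand-alone sketch of the pattern the patch applies. The map stands in
    // for the application's per-SchedulerRequestKey placement sets; none of
    // these names are the real RegularContainerAllocator code.
    public class PlacementNullCheckSketch {
      private final Map<String, String> placementByKey = new ConcurrentHashMap<>();

      String allocate(String schedulerKey) {
        // Capture the lookup once; another thread may remove the entry at any time.
        String placement = placementByKey.get(schedulerKey);
        if (placement == null) {
          // Pending resource for this key was decreased by a different thread:
          // skip the priority instead of dereferencing null.
          return "PRIORITY_SKIPPED";
        }
        return "allocated on partition '" + placement + "'";
      }

      public static void main(String[] args) {
        PlacementNullCheckSketch s = new PlacementNullCheckSketch();
        s.placementByKey.put("key-1", "");        // NO_LABEL-style empty partition
        System.out.println(s.allocate("key-1"));  // allocated on partition ''
        s.placementByKey.remove("key-1");         // simulates the racing update
        System.out.println(s.allocate("key-1"));  // PRIORITY_SKIPPED
      }
    }

The same "fetch once, null-check, bail out cleanly" shape is applied again in the canAssign path (defaulting uniqLocationAsks to 0) and in the reserved-container path (returning QUEUE_SKIPPED), as shown in the later hunks.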

diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/RegularContainerAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/RegularContainerAllocator.java
index a279e3e..7517012 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/RegularContainerAllocator.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/RegularContainerAllocator.java
@@ -178,11 +178,22 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
     // This is to make sure non-partitioned-resource-request will prefer
     // to be allocated to non-partitioned nodes
     int missedNonPartitionedRequestSchedulingOpportunity = 0;
+    SchedulingPlacementSet<FiCaSchedulerNode> schedulingPS =
+        appInfo.getSchedulingPlacementSet(schedulerKey);
+    if (null == schedulingPS){
+      // This is possible when #pending resource decreased by a different
+      // thread.
+      ActivitiesLogger.APP.recordSkippedAppActivityWithoutAllocation(
+          activitiesManager, node, application, priority,
+          ActivityDiagnosticConstant.PRIORITY_SKIPPED_BECAUSE_NULL_ANY_REQUEST);
+      return ContainerAllocation.PRIORITY_SKIPPED;
+    }
+    String requestPartition =
+        schedulingPS.getPrimaryRequestedNodePartition();
+
     // Only do this when request associated with given scheduler key accepts
     // NO_LABEL under RESPECT_EXCLUSIVITY mode
-    if (StringUtils.equals(RMNodeLabelsManager.NO_LABEL,
-        appInfo.getSchedulingPlacementSet(schedulerKey)
-            .getPrimaryRequestedNodePartition())) {
+    if (StringUtils.equals(RMNodeLabelsManager.NO_LABEL, requestPartition)) {
       missedNonPartitionedRequestSchedulingOpportunity =
           application.addMissedNonPartitionedRequestSchedulingOpportunity(
               schedulerKey);
@@ -260,12 +271,9 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
     return result;
   }
   
-  public float getLocalityWaitFactor(
-      SchedulerRequestKey schedulerKey, int clusterNodes) {
+  public float getLocalityWaitFactor(int uniqAsks, int clusterNodes) {
     // Estimate: Required unique resources (i.e. hosts + racks)
-    int requiredResources = Math.max(
-        application.getSchedulingPlacementSet(schedulerKey)
-            .getUniqueLocationAsks() - 1, 0);
+    int requiredResources = Math.max(uniqAsks - 1, 0);
     
     // waitFactor can't be more than '1' 
     // i.e. no point skipping more than clustersize opportunities
@@ -295,10 +303,16 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
       if (rmContext.getScheduler().getNumClusterNodes() == 0) {
         return false;
       }
+
+      int uniqLocationAsks = 0;
+      SchedulingPlacementSet<FiCaSchedulerNode> schedulingPS =
+          appInfo.getSchedulingPlacementSet(schedulerKey);
+      if (schedulingPS != null) {
+        uniqLocationAsks = schedulingPS.getUniqueLocationAsks();
+      }
       // If we have only ANY requests for this schedulerKey, we should not
       // delay its scheduling.
-      if (application.getSchedulingPlacementSet(schedulerKey)
-          .getUniqueLocationAsks() == 1) {
+      if (uniqLocationAsks == 1) {
         return true;
       }
 
@@ -312,7 +326,7 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
       } else {
         long requiredContainers = application.getOutstandingAsksCount(
             schedulerKey);
-        float localityWaitFactor = getLocalityWaitFactor(schedulerKey,
+        float localityWaitFactor = getLocalityWaitFactor(uniqLocationAsks,
             rmContext.getScheduler().getNumClusterNodes());
         // Cap the delay by the number of nodes in the cluster.
         return (Math.min(rmContext.getScheduler().getNumClusterNodes(),
@@ -825,6 +839,12 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
         application.getAppSchedulingInfo().getSchedulingPlacementSet(
             schedulerKey);
 
+    // This could be null when #pending request decreased by another thread.
+    if (schedulingPS == null) {
+      return new ContainerAllocation(reservedContainer, null,
+          AllocationState.QUEUE_SKIPPED);
+    }
+
     result = ContainerAllocation.PRIORITY_SKIPPED;
 
     Iterator<FiCaSchedulerNode> iter = schedulingPS.getPreferredNodeIterator(
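
For reference, the getLocalityWaitFactor change above only moves the unique-location-ask lookup to the (already null-checked) caller; the arithmetic itself is untouched. Below is a self-contained sketch of that calculation. Only the new signature and the Math.max(uniqAsks - 1, 0) line are verbatim from the hunk; normalizing by cluster size before capping at 1.0 is an assumption drawn from the "waitFactor can't be more than '1'" comment.

    // Illustrative sketch of the refactored locality wait factor, not the
    // committed Hadoop method. The normalization by clusterNodes is assumed.
    public class LocalityWaitFactorSketch {
      static float getLocalityWaitFactor(int uniqAsks, int clusterNodes) {
        // Estimate: required unique resources (i.e. hosts + racks).
        int requiredResources = Math.max(uniqAsks - 1, 0);
        // waitFactor can't be more than '1' --
        // no point skipping more than cluster-size opportunities.
        return Math.min((float) requiredResources / clusterNodes, 1.0f);
      }

      public static void main(String[] args) {
        System.out.println(getLocalityWaitFactor(5, 100)); // 0.04: 4 extra locations over 100 nodes
        System.out.println(getLocalityWaitFactor(1, 100)); // 0.0: only the ANY ask, no delay
        System.out.println(getLocalityWaitFactor(0, 100)); // 0.0: placement set was null
      }
    }

Passing uniqLocationAsks in from the caller also means a vanished placement set simply yields a wait factor of 0 instead of an NPE inside the locality-delay computation.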


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org