Posted to common-commits@hadoop.apache.org by jh...@apache.org on 2020/04/30 19:16:28 UTC
[hadoop] branch branch-2.10 updated: YARN-8193. YARN RM hangs abruptly (stops allocating resources) when running successive applications. (Zian Chen via wangda)
This is an automated email from the ASF dual-hosted git repository.
jhung pushed a commit to branch branch-2.10
in repository https://gitbox.apache.org/repos/asf/hadoop.git
The following commit(s) were added to refs/heads/branch-2.10 by this push:
new 27ad054 YARN-8193. YARN RM hangs abruptly (stops allocating resources) when running successive applications. (Zian Chen via wangda)
27ad054 is described below
commit 27ad054696a93cc12d84cdcda88cd0cf6cd11151
Author: Jonathan Hung <jh...@linkedin.com>
AuthorDate: Thu Apr 30 12:16:15 2020 -0700
YARN-8193. YARN RM hangs abruptly (stops allocating resources) when running successive applications. (Zian Chen via wangda)
---
.../allocator/RegularContainerAllocator.java | 42 ++++++++++++++++------
1 file changed, 31 insertions(+), 11 deletions(-)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/RegularContainerAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/RegularContainerAllocator.java
index a279e3e..7517012 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/RegularContainerAllocator.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/RegularContainerAllocator.java
@@ -178,11 +178,22 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
// This is to make sure non-partitioned-resource-request will prefer
// to be allocated to non-partitioned nodes
int missedNonPartitionedRequestSchedulingOpportunity = 0;
+ SchedulingPlacementSet<FiCaSchedulerNode> schedulingPS =
+ appInfo.getSchedulingPlacementSet(schedulerKey);
+ if (null == schedulingPS){
+ // This is possible when #pending resource decreased by a different
+ // thread.
+ ActivitiesLogger.APP.recordSkippedAppActivityWithoutAllocation(
+ activitiesManager, node, application, priority,
+ ActivityDiagnosticConstant.PRIORITY_SKIPPED_BECAUSE_NULL_ANY_REQUEST);
+ return ContainerAllocation.PRIORITY_SKIPPED;
+ }
+ String requestPartition =
+ schedulingPS.getPrimaryRequestedNodePartition();
+
// Only do this when request associated with given scheduler key accepts
// NO_LABEL under RESPECT_EXCLUSIVITY mode
- if (StringUtils.equals(RMNodeLabelsManager.NO_LABEL,
- appInfo.getSchedulingPlacementSet(schedulerKey)
- .getPrimaryRequestedNodePartition())) {
+ if (StringUtils.equals(RMNodeLabelsManager.NO_LABEL, requestPartition)) {
missedNonPartitionedRequestSchedulingOpportunity =
application.addMissedNonPartitionedRequestSchedulingOpportunity(
schedulerKey);
@@ -260,12 +271,9 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
return result;
}
- public float getLocalityWaitFactor(
- SchedulerRequestKey schedulerKey, int clusterNodes) {
+ public float getLocalityWaitFactor(int uniqAsks, int clusterNodes) {
// Estimate: Required unique resources (i.e. hosts + racks)
- int requiredResources = Math.max(
- application.getSchedulingPlacementSet(schedulerKey)
- .getUniqueLocationAsks() - 1, 0);
+ int requiredResources = Math.max(uniqAsks - 1, 0);
// waitFactor can't be more than '1'
// i.e. no point skipping more than clustersize opportunities
@@ -295,10 +303,16 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
if (rmContext.getScheduler().getNumClusterNodes() == 0) {
return false;
}
+
+ int uniqLocationAsks = 0;
+ SchedulingPlacementSet<FiCaSchedulerNode> schedulingPS =
+ appInfo.getSchedulingPlacementSet(schedulerKey);
+ if (schedulingPS != null) {
+ uniqLocationAsks = schedulingPS.getUniqueLocationAsks();
+ }
// If we have only ANY requests for this schedulerKey, we should not
// delay its scheduling.
- if (application.getSchedulingPlacementSet(schedulerKey)
- .getUniqueLocationAsks() == 1) {
+ if (uniqLocationAsks == 1) {
return true;
}
@@ -312,7 +326,7 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
} else {
long requiredContainers = application.getOutstandingAsksCount(
schedulerKey);
- float localityWaitFactor = getLocalityWaitFactor(schedulerKey,
+ float localityWaitFactor = getLocalityWaitFactor(uniqLocationAsks,
rmContext.getScheduler().getNumClusterNodes());
// Cap the delay by the number of nodes in the cluster.
return (Math.min(rmContext.getScheduler().getNumClusterNodes(),
@@ -825,6 +839,12 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
application.getAppSchedulingInfo().getSchedulingPlacementSet(
schedulerKey);
+ // This could be null when #pending request decreased by another thread.
+ if (schedulingPS == null) {
+ return new ContainerAllocation(reservedContainer, null,
+ AllocationState.QUEUE_SKIPPED);
+ }
+
result = ContainerAllocation.PRIORITY_SKIPPED;
Iterator<FiCaSchedulerNode> iter = schedulingPS.getPreferredNodeIterator(
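
Taken together, the hunks above apply one defensive pattern at each call site: fetch the SchedulingPlacementSet for the scheduler key once, skip the priority (or queue) when the lookup returns null because a concurrent thread has already decreased the pending resource, and reuse the captured reference instead of performing a second lookup that could observe a different state. The pre-patch code dereferenced appInfo.getSchedulingPlacementSet(schedulerKey) inline, so a concurrent removal could throw a NullPointerException on the scheduler thread, which is consistent with the "stops allocating resources" symptom in the subject line. Below is a minimal, self-contained sketch of that pattern; AllocatorSketch, placementSets, and preCheck are hypothetical stand-ins for the YARN types, not code from the commit.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

final class AllocatorSketch {
  enum Allocation { PRIORITY_SKIPPED, ALLOCATED }

  // Stand-in for AppSchedulingInfo: another thread may remove an entry
  // (pending resource decreased) while the scheduler thread is running.
  private final Map<String, String> placementSets = new ConcurrentHashMap<>();

  Allocation preCheck(String schedulerKey, String noLabel) {
    // Fetch once; two separate lookups could observe different states.
    String primaryPartition = placementSets.get(schedulerKey);
    if (primaryPartition == null) {
      // Pending resource decreased by a different thread: skip cleanly
      // instead of throwing NullPointerException on the scheduler thread.
      return Allocation.PRIORITY_SKIPPED;
    }
    // Reuse the captured reference for the rest of the method, as the
    // patch does with its local schedulingPS / requestPartition variables.
    if (noLabel.equals(primaryPartition)) {
      // ... missed-opportunity bookkeeping would go here ...
    }
    return Allocation.ALLOCATED;
  }
}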
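
The getLocalityWaitFactor refactor also makes the delay-scheduling arithmetic easier to follow: the caller now resolves uniqLocationAsks once (defaulting to 0 when the placement set is gone) and the factor is computed from plain ints. The sketch below reconstructs that arithmetic with a worked example. Note the return expressions are not visible in the quoted hunks, so they are assumptions inferred from the surrounding comments ("waitFactor can't be more than '1'", "Cap the delay by the number of nodes in the cluster"), not the committed code.

final class LocalityDelaySketch {

  // uniqAsks ~ unique locations (hosts + racks) requested for this key.
  static float localityWaitFactor(int uniqAsks, int clusterNodes) {
    // Estimate: required unique resources, as in the diff.
    int requiredResources = Math.max(uniqAsks - 1, 0);
    // waitFactor can't be more than 1: no point skipping more than
    // cluster-size opportunities. (Assumed return; not in the hunk.)
    return Math.min((float) requiredResources / clusterNodes, 1.0f);
  }

  static boolean canRelaxLocality(int uniqAsks, int clusterNodes,
      long requiredContainers, long missedOpportunities) {
    // Callers guard clusterNodes > 0, mirroring the early return on
    // getNumClusterNodes() == 0 in the diff.
    if (uniqAsks == 1) {
      // Only an ANY request for this scheduler key: do not delay it.
      return true;
    }
    float waitFactor = localityWaitFactor(uniqAsks, clusterNodes);
    // Cap the delay by the number of nodes in the cluster. (Assumed
    // comparison; the quoted hunk truncates after Math.min.)
    return Math.min(clusterNodes, requiredContainers * waitFactor)
        < missedOpportunities;
  }

  public static void main(String[] args) {
    System.out.println(localityWaitFactor(21, 100));      // 0.2
    System.out.println(canRelaxLocality(21, 100, 10, 3)); // true
  }
}

On this reading, an application asking for 21 unique hosts and racks on a 100-node cluster waits out min(100, 10 * 0.2) = 2 scheduling opportunities before relaxing locality for its 10 outstanding containers.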