You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by ep...@apache.org on 2020/08/10 21:12:27 UTC
[hadoop] branch branch-2.10 updated: YARN-8459. Improve Capacity
Scheduler logs to debug invalid states. Contributed by Wangda Tan and Jim
Brennan.
This is an automated email from the ASF dual-hosted git repository.
epayne pushed a commit to branch branch-2.10
in repository https://gitbox.apache.org/repos/asf/hadoop.git
The following commit(s) were added to refs/heads/branch-2.10 by this push:
new 21788f9 YARN-8459. Improve Capacity Scheduler logs to debug invalid states. Contributed by Wangda Tan and Jim Brennan.
21788f9 is described below
commit 21788f9fd4681091b26df2a4249329528ead836d
Author: Eric E Payne <er...@verizonmedia.com>
AuthorDate: Mon Aug 10 20:52:44 2020 +0000
YARN-8459. Improve Capacity Scheduler logs to debug invalid states. Contributed by Wangda Tan and Jim Brennan.
---
.../scheduler/capacity/CapacityScheduler.java | 35 ++++++++++++++-------
.../scheduler/capacity/ParentQueue.java | 36 ++++++++++++++--------
.../allocator/AbstractContainerAllocator.java | 13 +++++---
.../scheduler/common/fica/FiCaSchedulerApp.java | 5 +++
4 files changed, 61 insertions(+), 28 deletions(-)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java
index 426aa9c..7f41d8f 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java
@@ -1254,10 +1254,17 @@ public class CapacityScheduler extends
*/
private CSAssignment allocateContainerOnSingleNode(PlacementSet<FiCaSchedulerNode> ps,
FiCaSchedulerNode node, boolean withNodeHeartbeat) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(
+ "Trying to schedule on node: " + node.getNodeName() + ", available: "
+ + node.getUnallocatedResource());
+ }
+
// Backward compatible way to make sure previous behavior which allocation
// driven by node heartbeat works.
if (getNode(node.getNodeID()) != node) {
- LOG.error("Trying to schedule on a removed node, please double check.");
+ LOG.error("Trying to schedule on a removed node, please double check, "
+ + "nodeId=" + node.getNodeID());
return null;
}
@@ -1271,14 +1278,19 @@ public class CapacityScheduler extends
FiCaSchedulerApp reservedApplication = getCurrentAttemptForContainer(
reservedContainer.getContainerId());
if (reservedApplication == null) {
- LOG.error("Trying to schedule for a finished app, please double check.");
+ LOG.error(
+ "Trying to schedule for a finished app, please double check. nodeId="
+ + node.getNodeID() + " container=" + reservedContainer
+ .getContainerId());
return null;
}
// Try to fulfill the reservation
- LOG.info(
- "Trying to fulfill reservation for application " + reservedApplication
- .getApplicationId() + " on node: " + node.getNodeID());
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Trying to fulfill reservation for application "
+ + reservedApplication.getApplicationId() + " on node: " + node
+ .getNodeID());
+ }
LeafQueue queue = ((LeafQueue) reservedApplication.getQueue());
assignment = queue.assignContainers(getClusterResource(), ps,
@@ -1342,12 +1354,6 @@ public class CapacityScheduler extends
return null;
}
- if (LOG.isDebugEnabled()) {
- LOG.debug(
- "Trying to schedule on node: " + node.getNodeName() + ", available: "
- + node.getUnallocatedResource());
- }
-
return allocateOrReserveNewContainers(ps, withNodeHeartbeat);
}
@@ -2578,6 +2584,7 @@ public class CapacityScheduler extends
LOG.debug("Try to commit allocation proposal=" + request);
}
+ boolean isSuccess = false;
if (attemptId != null) {
FiCaSchedulerApp app = getApplicationAttempt(attemptId);
// Required sanity check for attemptId - when async-scheduling enabled,
@@ -2589,6 +2596,7 @@ public class CapacityScheduler extends
CapacitySchedulerMetrics.getMetrics()
.addCommitSuccess(commitSuccess);
LOG.info("Allocation proposal accepted");
+ isSuccess = true;
} else{
long commitFailed = System.nanoTime() - commitStart;
CapacitySchedulerMetrics.getMetrics()
@@ -2596,6 +2604,11 @@ public class CapacityScheduler extends
LOG.info("Failed to accept allocation proposal");
}
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Allocation proposal accepted=" + isSuccess + ", proposal="
+ + request);
+ }
+
// Update unconfirmed allocated resource.
if (updateUnconfirmedAllocatedResource) {
app.decUnconfirmedRes(request.getTotalAllocatedResource());
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java
index 2de13cf..496bbf4 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java
@@ -85,6 +85,8 @@ public class ParentQueue extends AbstractCSQueue {
private QueueOrderingPolicy queueOrderingPolicy;
+ private long lastSkipQueueDebugLoggingTimestamp = -1;
+
public ParentQueue(CapacitySchedulerContext cs,
String queueName, CSQueue parent, CSQueue old) throws IOException {
super(cs, queueName, parent, old);
@@ -487,9 +489,14 @@ public class ParentQueue extends AbstractCSQueue {
if (schedulingMode == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY
&& !accessibleToPartition(ps.getPartition())) {
if (LOG.isDebugEnabled()) {
- LOG.debug("Skip this queue=" + getQueuePath()
- + ", because it is not able to access partition=" + ps
- .getPartition());
+ long now = System.currentTimeMillis();
+ // Do logging every 1 sec to avoid excessive logging.
+ if (now - this.lastSkipQueueDebugLoggingTimestamp > 1000) {
+ LOG.debug("Skip this queue=" + getQueuePath()
+ + ", because it is not able to access partition=" + ps
+ .getPartition());
+ this.lastSkipQueueDebugLoggingTimestamp = now;
+ }
}
ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
@@ -509,10 +516,15 @@ public class ParentQueue extends AbstractCSQueue {
if (!super.hasPendingResourceRequest(ps.getPartition(), clusterResource,
schedulingMode)) {
if (LOG.isDebugEnabled()) {
- LOG.debug("Skip this queue=" + getQueuePath()
- + ", because it doesn't need more resource, schedulingMode="
- + schedulingMode.name() + " node-partition=" + ps
- .getPartition());
+ long now = System.currentTimeMillis();
+ // Do logging every 1 sec to avoid excessive logging.
+ if (now - this.lastSkipQueueDebugLoggingTimestamp > 1000) {
+ LOG.debug("Skip this queue=" + getQueuePath()
+ + ", because it doesn't need more resource, schedulingMode="
+ + schedulingMode.name() + " node-partition=" + ps
+ .getPartition());
+ this.lastSkipQueueDebugLoggingTimestamp = now;
+ }
}
ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
@@ -613,12 +625,12 @@ public class ParentQueue extends AbstractCSQueue {
assignment.setIncreasedAllocation(
assignedToChild.isIncreasedAllocation());
- LOG.info("assignedContainer" + " queue=" + getQueueName()
- + " usedCapacity=" + getUsedCapacity() + " absoluteUsedCapacity="
- + getAbsoluteUsedCapacity() + " used=" + queueUsage.getUsed()
- + " cluster=" + clusterResource);
-
if (LOG.isDebugEnabled()) {
+ LOG.debug("assignedContainer reserved=" + isReserved + " queue="
+ + getQueueName() + " usedCapacity=" + getUsedCapacity()
+ + " absoluteUsedCapacity=" + getAbsoluteUsedCapacity() + " used="
+ + queueUsage.getUsed() + " cluster=" + clusterResource);
+
LOG.debug(
"ParentQ=" + getQueueName() + " assignedSoFarInThisIteration="
+ assignment.getResource() + " usedCapacity="
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/AbstractContainerAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/AbstractContainerAllocator.java
index d3ab01e..d9a1ce5 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/AbstractContainerAllocator.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/AbstractContainerAllocator.java
@@ -94,11 +94,14 @@ public abstract class AbstractContainerAllocator {
assignment.setType(result.getContainerNodeType());
if (result.getAllocationState() == AllocationState.RESERVED) {
- // This is a reserved container
- LOG.info("Reserved container " + " application="
- + application.getApplicationId() + " resource=" + allocatedResource
- + " queue=" + appInfo.getQueueName()
- + " cluster=" + clusterResource);
+ if (LOG.isDebugEnabled()) {
+ // This is a reserved container
+ // Since re-reservation could happen again and again for already
+ // reserved containers. only do this in debug log.
+ LOG.debug("Reserved container " + " application=" + application
+ .getApplicationId() + " resource=" + allocatedResource + " queue="
+ + appInfo.getQueueName() + " cluster=" + clusterResource);
+ }
assignment.getAssignmentInformation().addReservationDetails(
updatedContainer, application.getCSLeafQueue().getQueuePath());
assignment.getAssignmentInformation().incrReservations();
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java
index dc1a0a2..35e0e94 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java
@@ -591,6 +591,11 @@ public class FiCaSchedulerApp extends SchedulerApplicationAttempt {
schedulerContainer.getRmContainer(),
schedulerContainer.getRmContainer().getContainer(),
reReservation);
+
+ LOG.info("Reserved container=" + rmContainer.getContainerId()
+ + ", on node=" + schedulerContainer.getSchedulerNode()
+ + " with resource=" + rmContainer
+ .getAllocatedOrReservedResource());
}
}
} finally {
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org