You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by eb...@apache.org on 2021/04/08 18:41:57 UTC
[hadoop] branch branch-3.1 updated: YARN-10702. Add cluster metric
for amount of CPU used by RM Event Processor. Contributed by Jim Brennan.
This is an automated email from the ASF dual-hosted git repository.
ebadger pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/hadoop.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new c70ac91 YARN-10702. Add cluster metric for amount of CPU used by RM Event Processor. Contributed by Jim Brennan.
c70ac91 is described below
commit c70ac91d3fb09474d50a8724b2973185111572fb
Author: Eric Badger <eb...@verizonmedia.com>
AuthorDate: Thu Apr 8 18:41:28 2021 +0000
YARN-10702. Add cluster metric for amount of CPU used by RM Event Processor.
Contributed by Jim Brennan.
---
.../apache/hadoop/yarn/conf/YarnConfiguration.java | 12 +++
.../apache/hadoop/yarn/event/EventDispatcher.java | 7 ++
.../src/main/resources/yarn-default.xml | 13 +++
.../server/resourcemanager/ClusterMetrics.java | 27 +++++
.../server/resourcemanager/ResourceManager.java | 111 ++++++++++++++++++++-
.../webapp/MetricsOverviewTable.java | 5 +-
.../webapp/dao/ClusterMetricsInfo.java | 15 ++-
.../resourcemanager/webapp/TestNodesPage.java | 2 +-
.../resourcemanager/webapp/TestRMWebServices.java | 2 +-
9 files changed, 187 insertions(+), 7 deletions(-)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 47baa3d..737baee 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -2398,6 +2398,18 @@ public class YarnConfiguration extends Configuration {
public static final int
DEFAULT_YARN_DISPATCHER_PRINT_EVENTS_INFO_THRESHOLD = 5000;
+ /** Resource manager dispatcher thread cpu monitor sampling rate.
+ * Units are samples per minute. This controls how often to sample
+ * the cpu utilization of the resource manager dispatcher thread.
+ * The cpu utilization is displayed on the RM UI as scheduler busy %.
+ * Set to zero to disable the dispatcher thread monitor.
+ */
+ public static final String
+ YARN_DISPATCHER_CPU_MONITOR_SAMPLES_PER_MIN =
+ YARN_PREFIX + "dispatcher.cpu-monitor.samples-per-min";
+ public static final int
+ DEFAULT_YARN_DISPATCHER_CPU_MONITOR_SAMPLES_PER_MIN = 60;
+
/**
* CLASSPATH for YARN applications. A comma-separated list of CLASSPATH
* entries
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/event/EventDispatcher.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/event/EventDispatcher.java
index 7c7a87b..7a1dff4 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/event/EventDispatcher.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/event/EventDispatcher.java
@@ -131,4 +131,11 @@ public class EventDispatcher<T extends Event> extends
public void disableExitOnError() {
shouldExitOnError = false;
}
+ protected long getEventProcessorId() {
+ return this.eventProcessor.getId();
+ }
+
+ protected boolean isStopped() {
+ return this.stopped;
+ }
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index e591433..3b0cb67 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -121,6 +121,19 @@
</property>
<property>
+ <description>
+ Resource manager dispatcher thread cpu monitor sampling rate.
+ Units are samples per minute. This controls how often to sample
+ the cpu utilization of the resource manager dispatcher thread.
+ The cpu utilization is displayed on the RM UI as scheduler busy %.
+ Set this to zero to disable the dispatcher thread monitor. Defaults
+ to 60 samples per minute.
+ </description>
+ <name>yarn.dispatcher.cpu-monitor.samples-per-min</name>
+ <value>60</value>
+ </property>
+
+ <property>
<description>The expiry interval for application master reporting.</description>
<name>yarn.am.liveness-monitor.expiry-interval-ms</name>
<value>600000</value>
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java
index 764d807..0d74dc1 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ClusterMetrics.java
@@ -56,6 +56,12 @@ public class ClusterMetrics {
@Metric("Vcore Utilization") MutableGaugeLong utilizedVirtualCores;
@Metric("Memory Capability") MutableGaugeLong capabilityMB;
@Metric("Vcore Capability") MutableGaugeLong capabilityVirtualCores;
+ @Metric("RM Event Processor CPU Usage 60 second Avg") MutableGaugeLong
+ rmEventProcCPUAvg;
+ @Metric("RM Event Processor CPU Usage 60 second Max") MutableGaugeLong
+ rmEventProcCPUMax;
+
+ private boolean rmEventProcMonitorEnable = false;
private static final MetricsInfo RECORD_INFO = info("ClusterMetrics",
"Metrics for the Yarn Cluster");
@@ -91,6 +97,27 @@ public class ClusterMetrics {
INSTANCE = null;
}
+ // Indicate whether RM Event Thread CPU Monitor is enabled
+ public void setRmEventProcMonitorEnable(boolean value) {
+ rmEventProcMonitorEnable = value;
+ }
+ public boolean getRmEventProcMonitorEnable() {
+ return rmEventProcMonitorEnable;
+ }
+ // RM Event Processor CPU Usage
+ public long getRmEventProcCPUAvg() {
+ return rmEventProcCPUAvg.value();
+ }
+ public void setRmEventProcCPUAvg(long value) {
+ rmEventProcCPUAvg.set(value);
+ }
+ public long getRmEventProcCPUMax() {
+ return rmEventProcCPUMax.value();
+ }
+ public void setRmEventProcCPUMax(long value) {
+ rmEventProcCPUMax.set(value);
+ }
+
//Active Nodemanagers
public int getNumActiveNMs() {
return numActiveNMs.value();
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java
index 2fef798..ea4187b 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java
@@ -46,8 +46,9 @@ import org.apache.hadoop.util.JvmPauseMonitor;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.ShutdownHookManager;
import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.VersionInfo;
+import org.apache.hadoop.util.Time;
import org.apache.hadoop.util.curator.ZKCuratorManager;
+import org.apache.hadoop.util.VersionInfo;
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
@@ -125,6 +126,8 @@ import org.eclipse.jetty.webapp.WebAppContext;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
+import java.lang.management.ManagementFactory;
+import java.lang.management.ThreadMXBean;
import java.net.InetSocketAddress;
import java.net.URI;
import java.net.URL;
@@ -431,7 +434,21 @@ public class ResourceManager extends CompositeService
}
protected EventHandler<SchedulerEvent> createSchedulerEventDispatcher() {
- return new EventDispatcher(this.scheduler, "SchedulerEventDispatcher");
+ String dispatcherName = "SchedulerEventDispatcher";
+ EventDispatcher dispatcher;
+ int threadMonitorRate = conf.getInt(
+ YarnConfiguration.YARN_DISPATCHER_CPU_MONITOR_SAMPLES_PER_MIN,
+ YarnConfiguration.DEFAULT_YARN_DISPATCHER_CPU_MONITOR_SAMPLES_PER_MIN);
+
+ if (threadMonitorRate > 0) {
+ dispatcher = new SchedulerEventDispatcher(dispatcherName,
+ threadMonitorRate);
+ ClusterMetrics.getMetrics().setRmEventProcMonitorEnable(true);
+ } else {
+ dispatcher = new EventDispatcher(this.scheduler, dispatcherName);
+ }
+
+ return dispatcher;
}
protected Dispatcher createDispatcher() {
@@ -949,7 +966,95 @@ public class ResourceManager extends CompositeService
}
}
- /**
+ @Private
+ private class SchedulerEventDispatcher extends
+ EventDispatcher<SchedulerEvent> {
+
+ private final Thread eventProcessorMonitor;
+
+ SchedulerEventDispatcher(String name, int samplesPerMin) {
+ super(scheduler, name);
+ this.eventProcessorMonitor =
+ new Thread(new EventProcessorMonitor(getEventProcessorId(),
+ samplesPerMin));
+ this.eventProcessorMonitor
+ .setName("ResourceManager Event Processor Monitor");
+ }
+ // EventProcessorMonitor keeps track of how much CPU the EventProcessor
+ // thread is using. It takes a configurable number of samples per minute,
+ // and then reports the Avg and Max of the previous 60 seconds as cluster
+ // metrics. Units are usecs of CPU used per second of wall-clock time.
+ // Avg is not accurate until one minute of samples has been received.
+ private final class EventProcessorMonitor implements Runnable {
+ private final long tid;
+ private final boolean run;
+ private final ThreadMXBean tmxb;
+ private final ClusterMetrics clusterMetrics = ClusterMetrics.getMetrics();
+ private final int samples;
+ EventProcessorMonitor(long id, int samplesPerMin) {
+ assert samplesPerMin > 0;
+ this.tid = id;
+ this.samples = samplesPerMin;
+ this.tmxb = ManagementFactory.getThreadMXBean();
+ if (clusterMetrics != null &&
+ tmxb != null && tmxb.isThreadCpuTimeSupported()) {
+ this.run = true;
+ clusterMetrics.setRmEventProcMonitorEnable(true);
+ } else {
+ this.run = false;
+ }
+ }
+ public void run() {
+ int index = 0;
+ long[] values = new long[samples];
+ int sleepMs = (60 * 1000) / samples;
+
+ while (run && !isStopped() && !Thread.currentThread().isInterrupted()) {
+ try {
+ long cpuBefore = tmxb.getThreadCpuTime(tid);
+ long wallClockBefore = Time.monotonicNow();
+ Thread.sleep(sleepMs);
+ long wallClockDelta = Time.monotonicNow() - wallClockBefore;
+ long cpuDelta = tmxb.getThreadCpuTime(tid) - cpuBefore;
+
+ // CPU time (nanoseconds) / wall-clock time (milliseconds) = usec per second
+ values[index] = cpuDelta / wallClockDelta;
+
+ index = (index + 1) % samples;
+ long max = 0;
+ long sum = 0;
+ for (int i = 0; i < samples; i++) {
+ sum += values[i];
+ max = Math.max(max, values[i]);
+ }
+ clusterMetrics.setRmEventProcCPUAvg(sum / samples);
+ clusterMetrics.setRmEventProcCPUMax(max);
+ } catch (InterruptedException e) {
+ LOG.error("Returning, interrupted : " + e);
+ return;
+ }
+ }
+ }
+ }
+ @Override
+ protected void serviceStart() throws Exception {
+ super.serviceStart();
+ this.eventProcessorMonitor.start();
+ }
+
+ @Override
+ protected void serviceStop() throws Exception {
+ super.serviceStop();
+ this.eventProcessorMonitor.interrupt();
+ try {
+ this.eventProcessorMonitor.join();
+ } catch (InterruptedException e) {
+ throw new YarnRuntimeException(e);
+ }
+ }
+ }
+
+ /**
* Transition to standby state in a new thread. The transition operation is
* asynchronous to avoid deadlock caused by cyclic dependency.
*/
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/MetricsOverviewTable.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/MetricsOverviewTable.java
index 009a012..3ce4f2b 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/MetricsOverviewTable.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/MetricsOverviewTable.java
@@ -204,7 +204,8 @@ public class MetricsOverviewTable extends HtmlBlock {
}
SchedulerInfo schedulerInfo = new SchedulerInfo(this.rm);
-
+ int schedBusy = clusterMetrics.getRmSchedulerBusyPercent();
+
div.h3("Scheduler Metrics").
table("#schedulermetricsoverview").
thead().$class("ui-widget-header").
@@ -215,6 +216,7 @@ public class MetricsOverviewTable extends HtmlBlock {
th().$class("ui-state-default").__("Maximum Allocation").__().
th().$class("ui-state-default")
.__("Maximum Cluster Application Priority").__().
+ th().$class("ui-state-default").__("Scheduler Busy %").__().
__().
__().
tbody().$class("ui-widget-content").
@@ -225,6 +227,7 @@ public class MetricsOverviewTable extends HtmlBlock {
td(schedulerInfo.getMinAllocation().toString()).
td(schedulerInfo.getMaxAllocation().toString()).
td(String.valueOf(schedulerInfo.getMaxClusterLevelAppPriority())).
+ td(schedBusy == -1 ? UNAVAILABLE : String.valueOf(schedBusy)).
__().
__().__();
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/ClusterMetricsInfo.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/ClusterMetricsInfo.java
index d0371ac..762a4e0 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/ClusterMetricsInfo.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/ClusterMetricsInfo.java
@@ -55,6 +55,7 @@ public class ClusterMetricsInfo {
private long totalVirtualCores;
private int utilizedMBPercent;
private int utilizedVirtualCoresPercent;
+ private int rmSchedulerBusyPercent;
private int totalNodes;
private int lostNodes;
private int unhealthyNodes;
@@ -139,7 +140,11 @@ public class ClusterMetricsInfo {
this.utilizedVirtualCoresPercent = baseCores <= 0 ? 0 :
(int) (clusterMetrics.getUtilizedVirtualCores() * 100 /
baseCores);
-
+ // Scheduler Busy is in usec per sec, so to get percent divide by 10^4
+ // Set to -1 if disabled.
+ this.rmSchedulerBusyPercent =
+ clusterMetrics.getRmEventProcMonitorEnable() ?
+ (int)(clusterMetrics.getRmEventProcCPUAvg() / 10000L) : -1;
this.activeNodes = clusterMetrics.getNumActiveNMs();
this.lostNodes = clusterMetrics.getNumLostNMs();
this.unhealthyNodes = clusterMetrics.getUnhealthyNMs();
@@ -259,6 +264,10 @@ public class ClusterMetricsInfo {
return utilizedVirtualCoresPercent;
}
+ public int getRmSchedulerBusyPercent() {
+ return rmSchedulerBusyPercent;
+ }
+
public void setContainersReserved(int containersReserved) {
this.containersReserved = containersReserved;
}
@@ -371,6 +380,10 @@ public class ClusterMetricsInfo {
this.utilizedVirtualCoresPercent = utilizedVirtualCoresPercent;
}
+ public void setRmSchedulerBusyPercent(int rmSchedulerBusyPercent) {
+ this.rmSchedulerBusyPercent = rmSchedulerBusyPercent;
+ }
+
public ResourceInfo getTotalClusterResourcesAcrossPartition() {
return totalClusterResourcesAcrossPartition;
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestNodesPage.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestNodesPage.java
index eb2090a..3556c73 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestNodesPage.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestNodesPage.java
@@ -52,7 +52,7 @@ public class TestNodesPage {
// Number of Actual Table Headers for NodesPage.NodesBlock might change in
// future. In that case this value should be adjusted to the new value.
- private final int numberOfThInMetricsTable = 22;
+ private final int numberOfThInMetricsTable = 23;
private final int numberOfActualTableHeaders = 18;
private final int numberOfThForOpportunisticContainers = 4;
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServices.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServices.java
index 23d2c3a..1b83fc7 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServices.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServices.java
@@ -461,7 +461,7 @@ public class TestRMWebServices extends JerseyTestBase {
Exception {
assertEquals("incorrect number of elements", 1, json.length());
JSONObject clusterinfo = json.getJSONObject("clusterMetrics");
- assertEquals("incorrect number of elements", 29, clusterinfo.length());
+ assertEquals("incorrect number of elements", 30, clusterinfo.length());
verifyClusterMetrics(
clusterinfo.getInt("appsSubmitted"), clusterinfo.getInt("appsCompleted"),
clusterinfo.getInt("reservedMB"), clusterinfo.getInt("availableMB"),
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org