You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by jh...@apache.org on 2019/02/06 22:41:19 UTC
[hadoop] 01/01: YARN-9280. Backport YARN-6620 to YARN-8200/branch-2
for NodeManager-side GPU isolation
This is an automated email from the ASF dual-hosted git repository.
jhung pushed a commit to branch YARN-8200
in repository https://gitbox.apache.org/repos/asf/hadoop.git
commit 5c2b6809b2e7c2f072913bc6493a7e9a56931d41
Author: Jonathan Hung <jh...@linkedin.com>
AuthorDate: Wed Feb 6 14:41:03 2019 -0800
YARN-9280. Backport YARN-6620 to YARN-8200/branch-2 for NodeManager-side GPU isolation
---
.../yarn/api/records/ResourceInformation.java | 10 +
.../apache/hadoop/yarn/conf/YarnConfiguration.java | 33 ++
.../hadoop/yarn/util/resource/ResourceUtils.java | 51 +-
.../src/main/resources/yarn-default.xml | 39 ++
.../yarn/util/resource/TestResourceUtils.java | 17 +
.../yarn/server/nodemanager/ContainerExecutor.java | 3 +-
.../hadoop/yarn/server/nodemanager/Context.java | 3 +
.../nodemanager/DefaultContainerExecutor.java | 2 +-
.../nodemanager/DockerContainerExecutor.java | 2 +-
.../server/nodemanager/LinuxContainerExecutor.java | 10 +-
.../yarn/server/nodemanager/NodeManager.java | 92 ++--
.../server/nodemanager/NodeStatusUpdaterImpl.java | 38 +-
.../linux/privileged/PrivilegedOperation.java | 1 +
.../linux/resources/ResourceHandlerChain.java | 4 +-
.../linux/resources/ResourceHandlerModule.java | 42 +-
.../linux/resources/gpu/GpuResourceAllocator.java | 242 +++++++++
.../resources/gpu/GpuResourceHandlerImpl.java | 153 ++++++
.../resourceplugin/NodeResourceUpdaterPlugin.java | 52 ++
.../resourceplugin/ResourcePlugin.java | 83 ++++
.../resourceplugin/ResourcePluginManager.java | 106 ++++
.../resourceplugin/gpu/GpuDiscoverer.java | 254 ++++++++++
.../gpu/GpuNodeResourceUpdateHandler.java | 66 +++
.../resourceplugin/gpu/GpuResourcePlugin.java | 61 +++
.../webapp/dao/gpu/GpuDeviceInformation.java | 72 +++
.../webapp/dao/gpu/GpuDeviceInformationParser.java | 87 ++++
.../webapp/dao/gpu/PerGpuDeviceInformation.java | 165 +++++++
.../webapp/dao/gpu/PerGpuMemoryUsage.java | 58 +++
.../webapp/dao/gpu/PerGpuTemperature.java | 80 +++
.../webapp/dao/gpu/PerGpuUtilizations.java | 50 ++
.../server/nodemanager/NodeManagerTestBase.java | 164 ++++++
.../nodemanager/TestDefaultContainerExecutor.java | 4 +-
.../TestDockerContainerExecutorWithMocks.java | 2 +-
.../nodemanager/TestLinuxContainerExecutor.java | 2 +-
.../TestLinuxContainerExecutorWithMocks.java | 2 +-
.../yarn/server/nodemanager/TestNodeManager.java | 2 +-
.../server/nodemanager/TestNodeStatusUpdater.java | 100 +---
.../nodemanager/amrmproxy/BaseAMRMProxyTest.java | 46 +-
.../linux/resources/TestResourceHandlerModule.java | 8 +-
.../resources/gpu/TestGpuResourceHandler.java | 385 +++++++++++++++
.../TestContainersMonitorResourceChange.java | 2 +-
.../resourceplugin/TestResourcePluginManager.java | 261 ++++++++++
.../resourceplugin/gpu/TestGpuDiscoverer.java | 123 +++++
.../dao/gpu/TestGpuDeviceInformationParser.java | 50 ++
.../test/resources/nvidia-smi-sample-xml-output | 547 +++++++++++++++++++++
44 files changed, 3368 insertions(+), 206 deletions(-)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceInformation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceInformation.java
index 0cc1e9c..8917a84 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceInformation.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceInformation.java
@@ -18,10 +18,13 @@
package org.apache.hadoop.yarn.api.records;
+import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.yarn.api.protocolrecords.ResourceTypes;
import org.apache.hadoop.yarn.util.UnitsConversionUtil;
+import java.util.Map;
+
/**
* Class to encapsulate information about a Resource - the name of the resource,
* the units(milli, micro, etc), the type(countable), and the value.
@@ -35,13 +38,20 @@ public class ResourceInformation implements Comparable<ResourceInformation> {
private long minimumAllocation;
private long maximumAllocation;
+ // Known resource types
public static final String MEMORY_URI = "memory-mb";
public static final String VCORES_URI = "vcores";
+ public static final String GPU_URI = "yarn.io/gpu";
public static final ResourceInformation MEMORY_MB =
ResourceInformation.newInstance(MEMORY_URI, "Mi");
public static final ResourceInformation VCORES =
ResourceInformation.newInstance(VCORES_URI);
+ public static final ResourceInformation GPUS =
+ ResourceInformation.newInstance(GPU_URI);
+
+ public static final Map<String, ResourceInformation> MANDATORY_RESOURCES =
+ ImmutableMap.of(MEMORY_URI, MEMORY_MB, VCORES_URI, VCORES, GPU_URI, GPUS);
/**
* Get the name for the resource.
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 37f833a..b0ba075 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -1411,6 +1411,39 @@ public class YarnConfiguration extends Configuration {
public static final String NM_NETWORK_RESOURCE_OUTBOUND_BANDWIDTH_YARN_MBIT =
NM_NETWORK_RESOURCE_PREFIX + "outbound-bandwidth-yarn-mbit";
+ /**
+ * Prefix for computation resources. Examples of computation resources
+ * are GPU, FPGA, TPU, etc.
+ */
+ @Private
+ public static final String NM_RESOURCE_PLUGINS =
+ NM_PREFIX + "resource-plugins";
+
+ /**
+ * Prefix for gpu configurations. Work in progress: This configuration
+ * parameter may be changed/removed in the future.
+ */
+ @Private
+ public static final String NM_GPU_RESOURCE_PREFIX =
+ NM_RESOURCE_PLUGINS + ".gpu.";
+
+ @Private
+ public static final String NM_GPU_ALLOWED_DEVICES =
+ NM_GPU_RESOURCE_PREFIX + "allowed-gpu-devices";
+ @Private
+ public static final String AUTOMATICALLY_DISCOVER_GPU_DEVICES = "auto";
+
+ /**
+ * This setting controls how to invoke the GPU discovery binaries.
+ */
+ @Private
+ public static final String NM_GPU_PATH_TO_EXEC =
+ NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables";
+
+ @Private
+ public static final String DEFAULT_NM_GPU_PATH_TO_EXEC = "";
+
+
/** NM Webapp address.**/
public static final String NM_WEBAPP_ADDRESS = NM_PREFIX + "webapp.address";
public static final int DEFAULT_NM_WEBAPP_PORT = 8042;
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/ResourceUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/ResourceUtils.java
index 1da5d6a..f3edc74 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/ResourceUtils.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/ResourceUtils.java
@@ -46,6 +46,8 @@ import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
+import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
+
/**
* Helper class to read the resource-types to be supported by the system.
*/
@@ -82,33 +84,32 @@ public class ResourceUtils {
*/
String key = "memory";
if (resourceInformationMap.containsKey(key)) {
- LOG.warn("Attempt to define resource '" + key +
- "', but it is not allowed.");
- throw new YarnRuntimeException("Attempt to re-define mandatory resource '"
- + key + "'.");
+ LOG.warn(
+ "Attempt to define resource '" + key + "', but it is not allowed.");
+ throw new YarnRuntimeException(
+ "Attempt to re-define mandatory resource '" + key + "'.");
}
- if (resourceInformationMap.containsKey(MEMORY)) {
- ResourceInformation memInfo = resourceInformationMap.get(MEMORY);
- String memUnits = ResourceInformation.MEMORY_MB.getUnits();
- ResourceTypes memType = ResourceInformation.MEMORY_MB.getResourceType();
- if (!memInfo.getUnits().equals(memUnits) || !memInfo.getResourceType()
- .equals(memType)) {
- throw new YarnRuntimeException(
- "Attempt to re-define mandatory resource 'memory-mb'. It can only"
- + " be of type 'COUNTABLE' and have units 'Mi'.");
- }
- }
-
- if (resourceInformationMap.containsKey(VCORES)) {
- ResourceInformation vcoreInfo = resourceInformationMap.get(VCORES);
- String vcoreUnits = ResourceInformation.VCORES.getUnits();
- ResourceTypes vcoreType = ResourceInformation.VCORES.getResourceType();
- if (!vcoreInfo.getUnits().equals(vcoreUnits) || !vcoreInfo
- .getResourceType().equals(vcoreType)) {
- throw new YarnRuntimeException(
- "Attempt to re-define mandatory resource 'vcores'. It can only be"
- + " of type 'COUNTABLE' and have units ''(no units).");
+ for (Map.Entry<String, ResourceInformation> mandatoryResourceEntry :
+ ResourceInformation.MANDATORY_RESOURCES.entrySet()) {
+ key = mandatoryResourceEntry.getKey();
+ ResourceInformation mandatoryRI = mandatoryResourceEntry.getValue();
+
+ ResourceInformation newDefinedRI = resourceInformationMap.get(key);
+ if (newDefinedRI != null) {
+ String expectedUnit = mandatoryRI.getUnits();
+ ResourceTypes expectedType = mandatoryRI.getResourceType();
+ String actualUnit = newDefinedRI.getUnits();
+ ResourceTypes actualType = newDefinedRI.getResourceType();
+
+ if (!expectedUnit.equals(actualUnit) || !expectedType.equals(
+ actualType)) {
+ throw new YarnRuntimeException("Defined mandatory resource type="
+ + key + " inside resource-types.xml, however its type or "
+ + "unit is conflict to mandatory resource types, expected type="
+ + expectedType + ", unit=" + expectedUnit + "; actual type="
+ + actualType + " actual unit=" + actualUnit);
+ }
}
}
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 768deb2..5392b39 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -3456,6 +3456,45 @@
</property>
<property>
+ <description>
+ When yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices=auto is
+ specified, YARN NodeManager needs to run a GPU discovery binary
+ (currently only nvidia-smi is supported) to get GPU-related information.
+ When the value is empty (default), YARN NodeManager will try to locate
+ the discovery executable itself.
+ An example of the config value is: /usr/local/bin/nvidia-smi
+ </description>
+ <name>yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables</name>
+ <value></value>
+ </property>
+
+ <property>
+ <description>
+ Enable additional discovery/isolation of resources on the NodeManager,
+ split by comma. By default, this is empty. Acceptable values: { "yarn.io/gpu" }.
+ </description>
+ <name>yarn.nodemanager.resource-plugins</name>
+ <value></value>
+ </property>
+
+ <property>
+ <description>
+ Specify GPU devices which can be managed by YARN NodeManager (split by
+ comma). The number of GPU devices will be reported to the RM to make
+ scheduling decisions.
+ Set to auto (default) to let YARN automatically discover GPU resources
+ from the system.
+ Manually specify GPU devices if auto detection fails or the admin only
+ wants a subset of GPU devices managed by YARN. GPU devices are identified
+ by their minor device number. A common approach to get minor device number
+ of GPUs is using "nvidia-smi -q" and search "Minor Number" output. An
+ example of manual specification is "0,1,2,4" to allow YARN NodeManager
+ to manage GPU devices with minor number 0/1/2/4.
+ </description>
+ <name>yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices</name>
+ <value>auto</value>
+ </property>
+
+ <property>
<description>The http address of the timeline reader web application.</description>
<name>yarn.timeline-service.reader.webapp.address</name>
<value>${yarn.timeline-service.webapp.address}</value>
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/resource/TestResourceUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/resource/TestResourceUtils.java
index d6bab92..80555ca 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/resource/TestResourceUtils.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/resource/TestResourceUtils.java
@@ -52,6 +52,23 @@ public class TestResourceUtils {
}
}
+ public static void addNewTypesToResources(String... resourceTypes) {
+ // Initialize resource map
+ Map<String, ResourceInformation> riMap = new HashMap<>();
+
+ // Initialize mandatory resources
+ riMap.put(ResourceInformation.MEMORY_URI, ResourceInformation.MEMORY_MB);
+ riMap.put(ResourceInformation.VCORES_URI, ResourceInformation.VCORES);
+
+ for (String newResource : resourceTypes) {
+ riMap.put(newResource, ResourceInformation
+ .newInstance(newResource, "", 0, ResourceTypes.COUNTABLE, 0,
+ Integer.MAX_VALUE));
+ }
+
+ ResourceUtils.initializeResourcesFromResourceInformationMap(riMap);
+ }
+
@Before
public void setup() {
ResourceUtils.resetResourceTypes();
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java
index 9454da4..e65b677 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java
@@ -112,9 +112,10 @@ public abstract class ContainerExecutor implements Configurable {
* Run the executor initialization steps.
* Verify that the necessary configs and permissions are in place.
*
+ * @param nmContext Context of NM
* @throws IOException if initialization fails
*/
- public abstract void init() throws IOException;
+ public abstract void init(Context nmContext) throws IOException;
/**
* This function localizes the JAR file on-demand.
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java
index 33cefea..7e16034 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java
@@ -34,6 +34,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManag
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
@@ -122,4 +123,6 @@ public interface Context {
void setNMTimelinePublisher(NMTimelinePublisher nmMetricsPublisher);
NMTimelinePublisher getNMTimelinePublisher();
+
+ ResourcePluginManager getResourcePluginManager();
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java
index b54b7f5..e659c3e 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java
@@ -134,7 +134,7 @@ public class DefaultContainerExecutor extends ContainerExecutor {
}
@Override
- public void init() throws IOException {
+ public void init(Context nmContext) throws IOException {
// nothing to do or verify here
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DockerContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DockerContainerExecutor.java
index a044cb6..6c2eb96 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DockerContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DockerContainerExecutor.java
@@ -117,7 +117,7 @@ public class DockerContainerExecutor extends ContainerExecutor {
}
@Override
- public void init() throws IOException {
+ public void init(Context nmContext) throws IOException {
String auth =
getConf().get(CommonConfigurationKeys.HADOOP_SECURITY_AUTHENTICATION);
if (auth != null && !auth.equals("simple")) {
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
index 765c49a..04f27c2 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
@@ -20,6 +20,7 @@ package org.apache.hadoop.yarn.server.nodemanager;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -281,7 +282,7 @@ public class LinuxContainerExecutor extends ContainerExecutor {
}
@Override
- public void init() throws IOException {
+ public void init(Context nmContext) throws IOException {
Configuration conf = super.getConf();
// Send command to executor which will just start up,
@@ -305,7 +306,7 @@ public class LinuxContainerExecutor extends ContainerExecutor {
try {
resourceHandlerChain = ResourceHandlerModule
- .getConfiguredResourceHandlerChain(conf);
+ .getConfiguredResourceHandlerChain(conf, nmContext);
if (LOG.isDebugEnabled()) {
LOG.debug("Resource handler chain enabled = " + (resourceHandlerChain
!= null));
@@ -845,4 +846,9 @@ public class LinuxContainerExecutor extends ContainerExecutor {
e);
}
}
+
+ @VisibleForTesting
+ public ResourceHandler getResourceHandler() {
+ return resourceHandlerChain;
+ }
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
index fcb5474..c74b54e 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
@@ -18,23 +18,7 @@
package org.apache.hadoop.yarn.server.nodemanager;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ConcurrentLinkedQueue;
-import java.util.concurrent.ConcurrentMap;
-import java.util.concurrent.ConcurrentSkipListMap;
-import java.util.concurrent.atomic.AtomicBoolean;
-
-import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
-import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl;
-import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
-import org.apache.hadoop.yarn.state.MultiStateTransitionListener;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
+import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
@@ -65,12 +49,16 @@ import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport;
import org.apache.hadoop.yarn.server.api.records.AppCollectorData;
import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus;
-import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager;
import org.apache.hadoop.yarn.server.nodemanager.collectormanager.NMCollectorService;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.apache.hadoop.yarn.server.nodemanager.nodelabels.ConfigurationNodeLabelsProvider;
import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider;
@@ -78,14 +66,25 @@ import org.apache.hadoop.yarn.server.nodemanager.nodelabels.ScriptBasedNodeLabel
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
-import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM;
import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher;
import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer;
+import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
+import org.apache.hadoop.yarn.state.MultiStateTransitionListener;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
-import com.google.common.annotations.VisibleForTesting;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+import java.util.concurrent.atomic.AtomicBoolean;
public class NodeManager extends CompositeService
implements EventHandler<NodeManagerEvent> {
@@ -332,6 +331,18 @@ public class NodeManager extends CompositeService
nmCheckintervalTime, scriptTimeout, scriptArgs);
}
+ @VisibleForTesting
+ protected ResourcePluginManager createResourcePluginManager() {
+ return new ResourcePluginManager();
+ }
+
+ @VisibleForTesting
+ protected ContainerExecutor createContainerExecutor(Configuration conf) {
+ return ReflectionUtils.newInstance(
+ conf.getClass(YarnConfiguration.NM_CONTAINER_EXECUTOR,
+ DefaultContainerExecutor.class, ContainerExecutor.class), conf);
+ }
+
@Override
protected void serviceInit(Configuration conf) throws Exception {
@@ -360,11 +371,20 @@ public class NodeManager extends CompositeService
this.aclsManager = new ApplicationACLsManager(conf);
- ContainerExecutor exec = ReflectionUtils.newInstance(
- conf.getClass(YarnConfiguration.NM_CONTAINER_EXECUTOR,
- DefaultContainerExecutor.class, ContainerExecutor.class), conf);
+ boolean isDistSchedulingEnabled =
+ conf.getBoolean(YarnConfiguration.DIST_SCHEDULING_ENABLED,
+ YarnConfiguration.DEFAULT_DIST_SCHEDULING_ENABLED);
+
+ this.context = createNMContext(containerTokenSecretManager,
+ nmTokenSecretManager, nmStore, isDistSchedulingEnabled, conf);
+
+ ResourcePluginManager pluginManager = createResourcePluginManager();
+ pluginManager.initialize(context);
+ ((NMContext)context).setResourcePluginManager(pluginManager);
+
+ ContainerExecutor exec = createContainerExecutor(conf);
try {
- exec.init();
+ exec.init(context);
} catch (IOException e) {
throw new YarnRuntimeException("Failed to initialize container executor", e);
}
@@ -380,13 +400,6 @@ public class NodeManager extends CompositeService
getNodeHealthScriptRunner(conf), dirsHandler);
addService(nodeHealthChecker);
- boolean isDistSchedulingEnabled =
- conf.getBoolean(YarnConfiguration.DIST_SCHEDULING_ENABLED,
- YarnConfiguration.DEFAULT_DIST_SCHEDULING_ENABLED);
-
- this.context = createNMContext(containerTokenSecretManager,
- nmTokenSecretManager, nmStore, isDistSchedulingEnabled, conf);
-
((NMContext)context).setContainerExecutor(exec);
@@ -460,6 +473,12 @@ public class NodeManager extends CompositeService
try {
super.serviceStop();
DefaultMetricsSystem.shutdown();
+
+ // Cleanup ResourcePluginManager
+ ResourcePluginManager rpm = context.getResourcePluginManager();
+ if (rpm != null) {
+ rpm.cleanup();
+ }
} finally {
// YARN-3641: NM's services stop get failed shouldn't block the
// release of NMLevelDBStore.
@@ -607,6 +626,8 @@ public class NodeManager extends CompositeService
private NMTimelinePublisher nmTimelinePublisher;
+ private ResourcePluginManager resourcePluginManager;
+
public NMContext(NMContainerTokenSecretManager containerTokenSecretManager,
NMTokenSecretManagerInNM nmTokenSecretManager,
LocalDirsHandlerService dirsHandler, ApplicationACLsManager aclsManager,
@@ -807,6 +828,15 @@ public class NodeManager extends CompositeService
public NMTimelinePublisher getNMTimelinePublisher() {
return nmTimelinePublisher;
}
+
+ public ResourcePluginManager getResourcePluginManager() {
+ return resourcePluginManager;
+ }
+
+ public void setResourcePluginManager(
+ ResourcePluginManager resourcePluginManager) {
+ this.resourcePluginManager = resourcePluginManager;
+ }
}
/**
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java
index 888ee85..d776bdf 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java
@@ -33,6 +33,9 @@ import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
+
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -178,14 +181,15 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
long memoryMb = totalResource.getMemorySize();
float vMemToPMem =
conf.getFloat(
- YarnConfiguration.NM_VMEM_PMEM_RATIO,
- YarnConfiguration.DEFAULT_NM_VMEM_PMEM_RATIO);
+ YarnConfiguration.NM_VMEM_PMEM_RATIO,
+ YarnConfiguration.DEFAULT_NM_VMEM_PMEM_RATIO);
long virtualMemoryMb = (long)Math.ceil(memoryMb * vMemToPMem);
-
int virtualCores = totalResource.getVirtualCores();
- LOG.info("Nodemanager resources: memory set to " + memoryMb + "MB.");
- LOG.info("Nodemanager resources: vcores set to " + virtualCores + ".");
- LOG.info("Nodemanager resources: " + totalResource);
+
+ // Update configured resources via plugins.
+ updateConfiguredResourcesViaPlugins(totalResource);
+
+ LOG.info("Nodemanager resources is set to: " + totalResource);
metrics.addResource(totalResource);
@@ -342,12 +346,27 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
return ServerRMProxy.createRMProxy(conf, ResourceTracker.class);
}
+ private void updateConfiguredResourcesViaPlugins(
+ Resource configuredResource) throws YarnException {
+ ResourcePluginManager pluginManager = context.getResourcePluginManager();
+ if (pluginManager != null && pluginManager.getNameToPlugins() != null) {
+ // Update configured resource
+ for (ResourcePlugin resourcePlugin : pluginManager.getNameToPlugins()
+ .values()) {
+ if (resourcePlugin.getNodeResourceHandlerInstance() != null) {
+ resourcePlugin.getNodeResourceHandlerInstance()
+ .updateConfiguredResource(configuredResource);
+ }
+ }
+ }
+ }
+
@VisibleForTesting
protected void registerWithRM()
throws YarnException, IOException {
RegisterNodeManagerResponse regNMResponse;
Set<NodeLabel> nodeLabels = nodeLabelsHandler.getNodeLabelsForRegistration();
-
+
// Synchronize NM-RM registration with
// ContainerManagerImpl#increaseContainersResource and
// ContainerManagerImpl#startContainers to avoid race condition
@@ -358,6 +377,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
RegisterNodeManagerRequest.newInstance(nodeId, httpPort, totalResource,
nodeManagerVersionId, containerReports, getRunningApplications(),
nodeLabels, physicalResource);
+
if (containerReports != null) {
LOG.info("Registering with RM using containers :" + containerReports);
}
@@ -406,7 +426,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
if (masterKey != null) {
this.context.getContainerTokenSecretManager().setMasterKey(masterKey);
}
-
+
masterKey = regNMResponse.getNMTokenMasterKey();
if (masterKey != null) {
this.context.getNMTokenSecretManager().setMasterKey(masterKey);
@@ -733,7 +753,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
}
}
}
-
+
@Override
public long getRMIdentifier() {
return this.rmIdentifier;
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java
index 8402a16..db0b225 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java
@@ -51,6 +51,7 @@ public class PrivilegedOperation {
TC_READ_STATS("--tc-read-stats"),
ADD_PID_TO_CGROUP(""), //no CLI switch supported yet.
RUN_DOCKER_CMD("--run-docker"),
+ GPU("--module-gpu"),
LIST_AS_USER(""); //no CLI switch supported yet.
private final String option;
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerChain.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerChain.java
index 955d216..72bf30c 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerChain.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerChain.java
@@ -20,6 +20,7 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources;
+import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
@@ -135,7 +136,8 @@ public class ResourceHandlerChain implements ResourceHandler {
return allOperations;
}
- List<ResourceHandler> getResourceHandlerList() {
+ @VisibleForTesting
+ public List<ResourceHandler> getResourceHandlerList() {
return Collections.unmodifiableList(resourceHandlers);
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java
index 3c61cd4..ce850ab 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java
@@ -21,25 +21,28 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources;
import com.google.common.annotations.VisibleForTesting;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.apache.hadoop.yarn.server.nodemanager.util.CgroupsLCEResourcesHandler;
import org.apache.hadoop.yarn.server.nodemanager.util.DefaultLCEResourcesHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
-import java.util.Set;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.HashMap;
-import java.util.Arrays;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
+import java.util.Map;
+import java.util.Set;
/**
* Provides mechanisms to get various resource handlers - cpu, memory, network,
@@ -206,22 +209,41 @@ public class ResourceHandlerModule {
}
private static void initializeConfiguredResourceHandlerChain(
- Configuration conf) throws ResourceHandlerException {
+ Configuration conf, Context nmContext)
+ throws ResourceHandlerException {
ArrayList<ResourceHandler> handlerList = new ArrayList<>();
addHandlerIfNotNull(handlerList, getOutboundBandwidthResourceHandler(conf));
addHandlerIfNotNull(handlerList, getDiskResourceHandler(conf));
addHandlerIfNotNull(handlerList, getMemoryResourceHandler(conf));
addHandlerIfNotNull(handlerList, getCGroupsCpuResourceHandler(conf));
+ addHandlersFromConfiguredResourcePlugins(handlerList, conf, nmContext);
resourceHandlerChain = new ResourceHandlerChain(handlerList);
}
+ /**
+ * Appends resource handlers contributed by configured resource plugins
+ * (e.g. the GPU plugin) to the given handler list. Plugins returning a null
+ * handler are skipped via addHandlerIfNotNull.
+ * @param handlerList list receiving the plugin-created handlers
+ * @param conf NM configuration, used to initialize cgroups and the
+ * privileged operation executor
+ * @param nmContext NM context used to look up the ResourcePluginManager
+ * @throws ResourceHandlerException declared for handler-creation failures
+ */
+ private static void addHandlersFromConfiguredResourcePlugins(
+ List<ResourceHandler> handlerList, Configuration conf,
+ Context nmContext) throws ResourceHandlerException {
+ ResourcePluginManager pluginManager = nmContext.getResourcePluginManager();
+ if (pluginManager != null) {
+ Map<String, ResourcePlugin> pluginMap = pluginManager.getNameToPlugins();
+ if (pluginMap != null) {
+ for (ResourcePlugin plugin : pluginMap.values()) {
+ addHandlerIfNotNull(handlerList, plugin
+ .createResourceHandler(nmContext,
+ getInitializedCGroupsHandler(conf),
+ PrivilegedOperationExecutor.getInstance(conf)));
+ }
+ }
+ }
+ }
+
public static ResourceHandlerChain getConfiguredResourceHandlerChain(
- Configuration conf) throws ResourceHandlerException {
+ Configuration conf, Context nmContext) throws ResourceHandlerException {
if (resourceHandlerChain == null) {
synchronized (ResourceHandlerModule.class) {
if (resourceHandlerChain == null) {
- initializeConfiguredResourceHandlerChain(conf);
+ initializeConfiguredResourceHandlerChain(conf, nmContext);
}
}
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java
new file mode 100644
index 0000000..d6bae09
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java
@@ -0,0 +1,242 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Sets;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceInformation;
+import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
+
+/**
+ * Allocates GPU devices (identified by minor number) to containers and keeps
+ * track of which device is held by which container. All public methods are
+ * synchronized on this instance.
+ */
+public class GpuResourceAllocator {
+ final static Log LOG = LogFactory.getLog(GpuResourceAllocator.class);
+
+ // Minor numbers of GPU devices YARN is allowed to schedule on this node.
+ private Set<Integer> allowedGpuDevices = new TreeSet<>();
+ // Minor number -> container currently holding that device.
+ private Map<Integer, ContainerId> usedDevices = new TreeMap<>();
+ private Context nmContext;
+
+ public GpuResourceAllocator(Context ctx) {
+ this.nmContext = ctx;
+ }
+
+ /**
+ * Contains allowed and denied devices with minor number.
+ * Denied devices will be useful for cgroups devices module to do
+ * blacklisting.
+ */
+ static class GpuAllocation {
+ private Set<Integer> allowed = Collections.emptySet();
+ private Set<Integer> denied = Collections.emptySet();
+
+ GpuAllocation(Set<Integer> allowed, Set<Integer> denied) {
+ if (allowed != null) {
+ this.allowed = ImmutableSet.copyOf(allowed);
+ }
+ if (denied != null) {
+ this.denied = ImmutableSet.copyOf(denied);
+ }
+ }
+
+ public Set<Integer> getAllowedGPUs() {
+ return allowed;
+ }
+
+ public Set<Integer> getDeniedGPUs() {
+ return denied;
+ }
+ }
+
+ /**
+ * Add GPU to allowed list.
+ * @param minorNumber minor number of the GPU device.
+ */
+ public synchronized void addGpu(int minorNumber) {
+ allowedGpuDevices.add(minorNumber);
+ }
+
+ // Builds the "not enough GPUs" message used by assignGpus.
+ private String getResourceHandlerExceptionMessage(int numRequestedGpuDevices,
+ ContainerId containerId) {
+ return "Failed to find enough GPUs, requestor=" + containerId
+ + ", #RequestedGPUs=" + numRequestedGpuDevices + ", #availableGpus="
+ + getAvailableGpus();
+ }
+
+ @VisibleForTesting
+ public synchronized int getAvailableGpus() {
+ return allowedGpuDevices.size() - usedDevices.size();
+ }
+
+ /**
+ * Re-populate the device assignment map for a recovered container from the
+ * assignments persisted in its {@link ResourceMappings}.
+ * @param containerId id of the container being recovered
+ * @throws ResourceHandlerException if the container is unknown, a persisted
+ * device id is not a valid integer, is not in the allowed device list,
+ * or is already assigned to another container
+ */
+ public synchronized void recoverAssignedGpus(ContainerId containerId)
+ throws ResourceHandlerException {
+ Container c = nmContext.getContainers().get(containerId);
+ if (null == c) {
+ throw new ResourceHandlerException(
+ "This shouldn't happen, cannot find container with id="
+ + containerId);
+ }
+
+ for (Serializable deviceId : c.getResourceMappings().getAssignedResources(
+ GPU_URI)){
+ if (!(deviceId instanceof String)) {
+ throw new ResourceHandlerException(
+ "Trying to recover device id, however it"
+ + " is not String, this shouldn't happen");
+ }
+
+ int devId;
+ try {
+ devId = Integer.parseInt((String)deviceId);
+ } catch (NumberFormatException e) {
+ // Fixed: original concatenation produced "becauseit" (missing space).
+ throw new ResourceHandlerException("Failed to recover device id because"
+ + " it is not a valid integer, devId:" + deviceId);
+ }
+
+ // Make sure it is in allowed GPU device.
+ if (!allowedGpuDevices.contains(devId)) {
+ throw new ResourceHandlerException("Try to recover device id = " + devId
+ + " however it is not in allowed device list:" + StringUtils
+ .join(",", allowedGpuDevices));
+ }
+
+ // Make sure it is not occupied by anybody else
+ if (usedDevices.containsKey(devId)) {
+ throw new ResourceHandlerException("Try to recover device id = " + devId
+ + " however it is already assigned to container=" + usedDevices
+ .get(devId) + ", please double check what happened.");
+ }
+
+ usedDevices.put(devId, containerId);
+ }
+ }
+
+ // Returns the number of GPUs requested by the resource, 0 when the GPU
+ // resource type is not present in the request.
+ private int getRequestedGpus(Resource requestedResource) {
+ try {
+ return Long.valueOf(requestedResource.getResourceValue(
+ GPU_URI)).intValue();
+ } catch (ResourceNotFoundException e) {
+ return 0;
+ }
+ }
+
+ /**
+ * Assign GPUs to the requesting container.
+ * @param container container to allocate
+ * @return allocation containing assigned and denied GPU minor numbers;
+ * denied devices are used for cgroups devices blacklisting
+ * @throws ResourceHandlerException when not enough GPUs are available, or
+ * the assignment cannot be persisted to the NM state store
+ */
+ public synchronized GpuAllocation assignGpus(Container container)
+ throws ResourceHandlerException {
+ Resource requestedResource = container.getResource();
+ ContainerId containerId = container.getContainerId();
+ int numRequestedGpuDevices = getRequestedGpus(requestedResource);
+ // Assign Gpus to container if requested some.
+ if (numRequestedGpuDevices > 0) {
+ if (numRequestedGpuDevices > getAvailableGpus()) {
+ throw new ResourceHandlerException(
+ getResourceHandlerExceptionMessage(numRequestedGpuDevices,
+ containerId));
+ }
+
+ Set<Integer> assignedGpus = new HashSet<>();
+
+ for (int deviceNum : allowedGpuDevices) {
+ if (!usedDevices.containsKey(deviceNum)) {
+ usedDevices.put(deviceNum, containerId);
+ assignedGpus.add(deviceNum);
+ if (assignedGpus.size() == numRequestedGpuDevices) {
+ break;
+ }
+ }
+ }
+
+ // Record in state store if we allocated anything
+ if (!assignedGpus.isEmpty()) {
+ List<Serializable> allocatedDevices = new ArrayList<>();
+ for (int gpu : assignedGpus) {
+ allocatedDevices.add(String.valueOf(gpu));
+ }
+ try {
+ // Update Container#getResourceMapping.
+ ResourceMappings.AssignedResources assignedResources =
+ new ResourceMappings.AssignedResources();
+ assignedResources.updateAssignedResources(allocatedDevices);
+ container.getResourceMappings().addAssignedResources(GPU_URI,
+ assignedResources);
+
+ // Update state store. On failure, roll back the in-memory
+ // assignment so devices are not leaked.
+ nmContext.getNMStateStore().storeAssignedResources(containerId,
+ GPU_URI, allocatedDevices);
+ } catch (IOException e) {
+ cleanupAssignGpus(containerId);
+ throw new ResourceHandlerException(e);
+ }
+ }
+
+ return new GpuAllocation(assignedGpus,
+ Sets.difference(allowedGpuDevices, assignedGpus));
+ }
+ // Nothing requested: deny every allowed device.
+ return new GpuAllocation(null, allowedGpuDevices);
+ }
+
+ /**
+ * Clean up all Gpus assigned to containerId.
+ * @param containerId containerId
+ */
+ public synchronized void cleanupAssignGpus(ContainerId containerId) {
+ Iterator<Map.Entry<Integer, ContainerId>> iter =
+ usedDevices.entrySet().iterator();
+ while (iter.hasNext()) {
+ if (iter.next().getValue().equals(containerId)) {
+ iter.remove();
+ }
+ }
+ }
+
+ @VisibleForTesting
+ public synchronized Map<Integer, ContainerId> getDeviceAllocationMapping() {
+ return new HashMap<>(usedDevices);
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
new file mode 100644
index 0000000..7144bb2
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
@@ -0,0 +1,153 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.ResourceInformation;
+import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * ResourceHandler performing GPU isolation for containers: assigns GPU
+ * devices through {@link GpuResourceAllocator}, creates a per-container
+ * cgroup under the "devices" controller, and invokes the container-executor
+ * binary to block access to GPUs not assigned to the container.
+ */
+public class GpuResourceHandlerImpl implements ResourceHandler {
+ final static Log LOG = LogFactory
+ .getLog(GpuResourceHandlerImpl.class);
+
+ // This will be used by container-executor to add necessary clis
+ public static final String EXCLUDED_GPUS_CLI_OPTION = "--excluded_gpus";
+ public static final String CONTAINER_ID_CLI_OPTION = "--container_id";
+
+ private GpuResourceAllocator gpuAllocator;
+ private CGroupsHandler cGroupsHandler;
+ private PrivilegedOperationExecutor privilegedOperationExecutor;
+
+ public GpuResourceHandlerImpl(Context nmContext,
+ CGroupsHandler cGroupsHandler,
+ PrivilegedOperationExecutor privilegedOperationExecutor) {
+ this.cGroupsHandler = cGroupsHandler;
+ this.privilegedOperationExecutor = privilegedOperationExecutor;
+ gpuAllocator = new GpuResourceAllocator(nmContext);
+ }
+
+ /**
+ * Discovers GPUs usable by YARN, registers them with the allocator and
+ * initializes the cgroups "devices" controller.
+ * @return always null — no privileged operations needed at bootstrap
+ * @throws ResourceHandlerException if GPU discovery fails
+ */
+ @Override
+ public List<PrivilegedOperation> bootstrap(Configuration configuration)
+ throws ResourceHandlerException {
+ List<Integer> minorNumbersOfUsableGpus;
+ try {
+ minorNumbersOfUsableGpus = GpuDiscoverer.getInstance()
+ .getMinorNumbersOfGpusUsableByYarn();
+ } catch (YarnException e) {
+ LOG.error("Exception when trying to get usable GPU device", e);
+ throw new ResourceHandlerException(e);
+ }
+
+ for (int minorNumber : minorNumbersOfUsableGpus) {
+ gpuAllocator.addGpu(minorNumber);
+ }
+
+ // And initialize cgroups
+ this.cGroupsHandler.initializeCGroupController(
+ CGroupsHandler.CGroupController.DEVICES);
+
+ return null;
+ }
+
+ /**
+ * Assigns requested GPUs to the container, creates its devices cgroup and
+ * runs container-executor to exclude the denied GPUs before launch.
+ * The devices cgroup is deleted again if the privileged operation fails.
+ * @return a single ADD_PID_TO_CGROUP operation for the devices controller
+ * @throws ResourceHandlerException on allocation or isolation failure
+ */
+ @Override
+ public synchronized List<PrivilegedOperation> preStart(Container container)
+ throws ResourceHandlerException {
+ String containerIdStr = container.getContainerId().toString();
+
+ // Assign Gpus to container if requested some.
+ GpuResourceAllocator.GpuAllocation allocation = gpuAllocator.assignGpus(
+ container);
+
+ // Create device cgroups for the container
+ cGroupsHandler.createCGroup(CGroupsHandler.CGroupController.DEVICES,
+ containerIdStr);
+ try {
+ // Execute c-e to setup GPU isolation before launch the container
+ PrivilegedOperation privilegedOperation = new PrivilegedOperation(
+ PrivilegedOperation.OperationType.GPU, Arrays
+ .asList(CONTAINER_ID_CLI_OPTION, containerIdStr));
+ if (!allocation.getDeniedGPUs().isEmpty()) {
+ privilegedOperation.appendArgs(Arrays.asList(EXCLUDED_GPUS_CLI_OPTION,
+ StringUtils.join(",", allocation.getDeniedGPUs())));
+ }
+
+ privilegedOperationExecutor.executePrivilegedOperation(
+ privilegedOperation, true);
+ } catch (PrivilegedOperationException e) {
+ cGroupsHandler.deleteCGroup(CGroupsHandler.CGroupController.DEVICES,
+ containerIdStr);
+ LOG.warn("Could not update cgroup for container", e);
+ throw new ResourceHandlerException(e);
+ }
+
+ List<PrivilegedOperation> ret = new ArrayList<>();
+ ret.add(new PrivilegedOperation(
+ PrivilegedOperation.OperationType.ADD_PID_TO_CGROUP,
+ PrivilegedOperation.CGROUP_ARG_PREFIX
+ + cGroupsHandler.getPathForCGroupTasks(
+ CGroupsHandler.CGroupController.DEVICES, containerIdStr)));
+
+ return ret;
+ }
+
+ @VisibleForTesting
+ public GpuResourceAllocator getGpuAllocator() {
+ return gpuAllocator;
+ }
+
+ /**
+ * Restores GPU assignments of a recovered container from the NM state
+ * store via the allocator.
+ */
+ @Override
+ public List<PrivilegedOperation> reacquireContainer(ContainerId containerId)
+ throws ResourceHandlerException {
+ gpuAllocator.recoverAssignedGpus(containerId);
+ return null;
+ }
+
+ /**
+ * Releases GPUs held by the completed container and removes its devices
+ * cgroup.
+ */
+ @Override
+ public synchronized List<PrivilegedOperation> postComplete(
+ ContainerId containerId) throws ResourceHandlerException {
+ gpuAllocator.cleanupAssignGpus(containerId);
+ cGroupsHandler.deleteCGroup(CGroupsHandler.CGroupController.DEVICES,
+ containerId.toString());
+ return null;
+ }
+
+ // No handler-level teardown needed; per-container cleanup happens in
+ // postComplete.
+ @Override
+ public List<PrivilegedOperation> teardown() throws ResourceHandlerException {
+ return null;
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/NodeResourceUpdaterPlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/NodeResourceUpdaterPlugin.java
new file mode 100644
index 0000000..88f77ed
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/NodeResourceUpdaterPlugin.java
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
+
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+
+/**
+ * Plugins to handle resources on a node. This will be used by
+ * {@link org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater}.
+ */
+public abstract class NodeResourceUpdaterPlugin {
+ /**
+ * Update configured resource for the given component.
+ * @param res resource passed in by external module (such as
+ * {@link org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater}).
+ * @throws YarnException when any issue happens.
+ */
+ public abstract void updateConfiguredResource(Resource res)
+ throws YarnException;
+
+ /**
+ * This method will be called when the node's resource is loaded from
+ * dynamic-resources.xml in ResourceManager.
+ *
+ * @param newResource newResource reported by RM
+ * @throws YarnException when there is any mismatch between NM and RM
+ */
+ public void handleUpdatedResourceFromRM(Resource newResource) throws
+ YarnException {
+ // by default do nothing, subclass should implement this method when any
+ // special activities required upon new resource reported by RM.
+ }
+
+ // TODO: add implementation to update node attribute once YARN-3409 merged.
+}
\ No newline at end of file
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePlugin.java
new file mode 100644
index 0000000..6e134b3
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePlugin.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
+
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain;
+
+/**
+ * {@link ResourcePlugin} is an interface for node manager to more easily
+ * support discovery/manage/isolation for new resource types.
+ *
+ * <p>
+ * It has two major parts: {@link ResourcePlugin#createResourceHandler(Context,
+ * CGroupsHandler, PrivilegedOperationExecutor)} and
+ * {@link ResourcePlugin#getNodeResourceHandlerInstance()}, see javadocs below
+ * for more details.
+ * </p>
+ */
+public interface ResourcePlugin {
+ /**
+ * Initialize the plugin, this will be invoked during NM startup.
+ * @param context NM Context
+ * @throws YarnException when any issue occurs
+ */
+ void initialize(Context context) throws YarnException;
+
+ /**
+ * Plugin needs to return {@link ResourceHandler} when any special isolation
+ * is required for the resource type. This will be added to
+ * {@link ResourceHandlerChain} during NodeManager startup. When no special
+ * isolation is needed, return null.
+ *
+ * @param nmContext NodeManager context.
+ * @param cGroupsHandler CGroupsHandler
+ * @param privilegedOperationExecutor Privileged Operation Executor.
+ * @return ResourceHandler, or null when no isolation is needed.
+ */
+ ResourceHandler createResourceHandler(Context nmContext,
+ CGroupsHandler cGroupsHandler,
+ PrivilegedOperationExecutor privilegedOperationExecutor);
+
+ /**
+ * Plugin needs to return {@link NodeResourceUpdaterPlugin} when any discovery
+ * mechanism is required for the resource type. For example, if we want to set
+ * resource-value during NM registration or send update during NM-RM
+ * heartbeat, we can implement a {@link NodeResourceUpdaterPlugin} and update
+ * fields of
+ * {@link org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest}
+ * or {@link org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest}
+ *
+ * This will be invoked during every node status update or node registration,
+ * please avoid creating new instance every time.
+ *
+ * @return NodeResourceUpdaterPlugin, could be null when no discovery needed.
+ */
+ NodeResourceUpdaterPlugin getNodeResourceHandlerInstance();
+
+ /**
+ * Do cleanup of the plugin, this will be invoked when
+ * {@link org.apache.hadoop.yarn.server.nodemanager.NodeManager} stops.
+ * @throws YarnException if any issue occurs
+ */
+ void cleanup() throws YarnException;
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java
new file mode 100644
index 0000000..73d6038
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
+
+import com.google.common.collect.ImmutableSet;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
+
+/**
+ * Manages {@link ResourcePlugin} configured on this NodeManager.
+ */
+public class ResourcePluginManager {
+ private static final Logger LOG =
+ LoggerFactory.getLogger(ResourcePluginManager.class);
+ private static final Set<String> SUPPORTED_RESOURCE_PLUGINS = ImmutableSet.of(
+ GPU_URI);
+
+ private Map<String, ResourcePlugin> configuredPlugins = Collections.EMPTY_MAP;
+
+ public synchronized void initialize(Context context)
+ throws YarnException {
+ Configuration conf = context.getConf();
+ String[] plugins = conf.getStrings(YarnConfiguration.NM_RESOURCE_PLUGINS);
+
+ if (plugins != null) {
+ Map<String, ResourcePlugin> pluginMap = new HashMap<>();
+
+ // Initialize each plugins
+ for (String resourceName : plugins) {
+ resourceName = resourceName.trim();
+ if (!SUPPORTED_RESOURCE_PLUGINS.contains(resourceName)) {
+ String msg =
+ "Trying to initialize resource plugin with name=" + resourceName
+ + ", it is not supported, list of supported plugins:"
+ + StringUtils.join(",",
+ SUPPORTED_RESOURCE_PLUGINS);
+ LOG.error(msg);
+ throw new YarnException(msg);
+ }
+
+ if (pluginMap.containsKey(resourceName)) {
+ // Duplicated items, ignore ...
+ continue;
+ }
+
+ ResourcePlugin plugin = null;
+ if (resourceName.equals(GPU_URI)) {
+ plugin = new GpuResourcePlugin();
+ }
+
+ if (plugin == null) {
+ throw new YarnException(
+ "This shouldn't happen, plugin=" + resourceName
+ + " should be loaded and initialized");
+ }
+ plugin.initialize(context);
+ pluginMap.put(resourceName, plugin);
+ }
+
+ configuredPlugins = Collections.unmodifiableMap(pluginMap);
+ }
+ }
+
+ public synchronized void cleanup() throws YarnException {
+ for (ResourcePlugin plugin : configuredPlugins.values()) {
+ plugin.cleanup();
+ }
+ }
+
+ /**
+ * Get resource name (such as gpu/fpga) to plugin references.
+ * @return read-only map of resource name to plugins.
+ */
+ public synchronized Map<String, ResourcePlugin> getNameToPlugins() {
+ return configuredPlugins;
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
new file mode 100644
index 0000000..61b8ce5
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
@@ -0,0 +1,254 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableSet;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.Shell;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+public class GpuDiscoverer {
+ public static final Logger LOG = LoggerFactory.getLogger(
+ GpuDiscoverer.class);
+ @VisibleForTesting
+ protected static final String DEFAULT_BINARY_NAME = "nvidia-smi";
+
+ // When executable path not set, try to search default dirs
+ // By default search /usr/bin, /bin, and /usr/local/nvidia/bin (when
+ // launched by nvidia-docker.
+ private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
+ "/usr/bin", "/bin", "/usr/local/nvidia/bin");
+
+ // command should not run more than 10 sec.
+ private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
+ private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
+ private static GpuDiscoverer instance;
+
+ static {
+ instance = new GpuDiscoverer();
+ }
+
+ private Configuration conf = null;
+ private String pathOfGpuBinary = null;
+ private Map<String, String> environment = new HashMap<>();
+ private GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
+
+ private int numOfErrorExecutionSinceLastSucceed = 0;
+ GpuDeviceInformation lastDiscoveredGpuInformation = null;
+
+ private void validateConfOrThrowException() throws YarnException {
+ if (conf == null) {
+ throw new YarnException("Please initialize (call initialize) before use "
+ + GpuDiscoverer.class.getSimpleName());
+ }
+ }
+
+ /**
+ * Get GPU device information from system.
+ * This need to be called after initialize.
+ *
+ * Please note that this only works on *NIX platform, so external caller
+ * need to make sure this.
+ *
+ * @return GpuDeviceInformation
+ * @throws YarnException when any error happens
+ */
+ public synchronized GpuDeviceInformation getGpuDeviceInformation()
+ throws YarnException {
+ validateConfOrThrowException();
+
+ if (null == pathOfGpuBinary) {
+ throw new YarnException(
+ "Failed to find GPU discovery executable, please double check "
+ + YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting.");
+ }
+
+ if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
+ String msg =
+ "Failed to execute GPU device information detection script for "
+ + MAX_REPEATED_ERROR_ALLOWED
+ + " times, skip following executions.";
+ LOG.error(msg);
+ throw new YarnException(msg);
+ }
+
+ String output;
+ try {
+ output = Shell.execCommand(environment,
+ new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS);
+ GpuDeviceInformation info = parser.parseXml(output);
+ numOfErrorExecutionSinceLastSucceed = 0;
+ lastDiscoveredGpuInformation = info;
+ return info;
+ } catch (IOException e) {
+ numOfErrorExecutionSinceLastSucceed++;
+ String msg =
+ "Failed to execute " + pathOfGpuBinary + " exception message:" + e
+ .getMessage() + ", continue ...";
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(msg);
+ }
+ throw new YarnException(e);
+ } catch (YarnException e) {
+ numOfErrorExecutionSinceLastSucceed++;
+ String msg = "Failed to parse xml output" + e.getMessage();
+ if (LOG.isDebugEnabled()) {
+ LOG.warn(msg, e);
+ }
+ throw e;
+ }
+ }
+
+ /**
+ * Get list of minor device numbers of Gpu devices usable by YARN.
+ *
+ * @return List of minor device numbers of Gpu devices.
+ * @throws YarnException when any issue happens
+ */
+ public synchronized List<Integer> getMinorNumbersOfGpusUsableByYarn()
+ throws YarnException {
+ validateConfOrThrowException();
+
+ String allowedDevicesStr = conf.get(
+ YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
+ YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
+
+ List<Integer> minorNumbers = new ArrayList<>();
+
+ if (allowedDevicesStr.equals(
+ YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) {
+ // Get gpu device information from system.
+ if (null == lastDiscoveredGpuInformation) {
+ String msg = YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " is set to "
+ + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES
+ + ", however automatically discovering "
+ + "GPU information failed, please check NodeManager log for more"
+ + " details, as an alternative, admin can specify "
+ + YarnConfiguration.NM_GPU_ALLOWED_DEVICES
+ + " manually to enable GPU isolation.";
+ LOG.error(msg);
+ throw new YarnException(msg);
+ }
+
+ if (lastDiscoveredGpuInformation.getGpus() != null) {
+ for (PerGpuDeviceInformation gpu : lastDiscoveredGpuInformation
+ .getGpus()) {
+ minorNumbers.add(gpu.getMinorNumber());
+ }
+ }
+ } else{
+ for (String s : allowedDevicesStr.split(",")) {
+ if (s.trim().length() > 0) {
+ minorNumbers.add(Integer.valueOf(s.trim()));
+ }
+ }
+ LOG.info("Allowed GPU devices with minor numbers:" + allowedDevicesStr);
+ }
+
+ return minorNumbers;
+ }
+
+ public synchronized void initialize(Configuration conf) throws YarnException {
+ this.conf = conf;
+ numOfErrorExecutionSinceLastSucceed = 0;
+ String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC,
+ YarnConfiguration.DEFAULT_NM_GPU_PATH_TO_EXEC);
+ if (pathToExecutable.isEmpty()) {
+ pathToExecutable = DEFAULT_BINARY_NAME;
+ }
+
+ // Validate file existence
+ File binaryPath = new File(pathToExecutable);
+
+ if (!binaryPath.exists()) {
+ // When binary not exist, use default setting.
+ boolean found = false;
+ for (String dir : DEFAULT_BINARY_SEARCH_DIRS) {
+ binaryPath = new File(dir, DEFAULT_BINARY_NAME);
+ if (binaryPath.exists()) {
+ found = true;
+ pathOfGpuBinary = binaryPath.getAbsolutePath();
+ break;
+ }
+ }
+
+ if (!found) {
+ LOG.warn("Failed to locate binary at:" + binaryPath.getAbsolutePath()
+ + ", please double check [" + YarnConfiguration.NM_GPU_PATH_TO_EXEC
+ + "] setting. Now use " + "default binary:" + DEFAULT_BINARY_NAME);
+ }
+ } else{
+ // If path specified by user is a directory, use
+ if (binaryPath.isDirectory()) {
+ binaryPath = new File(binaryPath, DEFAULT_BINARY_NAME);
+ LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME
+ + " under the directory, updated path-to-executable:" + binaryPath
+ .getAbsolutePath());
+ }
+ // Validated
+ pathOfGpuBinary = binaryPath.getAbsolutePath();
+ }
+
+ // Try to discover GPU information once and print
+ try {
+ LOG.info("Trying to discover GPU information ...");
+ GpuDeviceInformation info = getGpuDeviceInformation();
+ LOG.info(info.toString());
+ } catch (YarnException e) {
+ String msg =
+ "Failed to discover GPU information from system, exception message:"
+ + e.getMessage() + " continue...";
+ LOG.warn(msg);
+ }
+ }
+
+ @VisibleForTesting
+ protected Map<String, String> getEnvironmentToRunCommand() {
+ return environment;
+ }
+
+ @VisibleForTesting
+ protected String getPathOfGpuBinary() {
+ return pathOfGpuBinary;
+ }
+
+ public static GpuDiscoverer getInstance() {
+ return instance;
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
new file mode 100644
index 0000000..f6bf506
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceInformation;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
+import org.apache.hadoop.yarn.util.resource.ResourceUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.List;
+import java.util.Map;
+
+import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
+
+public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
+ private static final Logger LOG =
+ LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class);
+
+ @Override
+ public void updateConfiguredResource(Resource res) throws YarnException {
+ LOG.info("Initializing configured GPU resources for the NodeManager.");
+
+ List<Integer> usableGpus =
+ GpuDiscoverer.getInstance().getMinorNumbersOfGpusUsableByYarn();
+ if (null == usableGpus || usableGpus.isEmpty()) {
+ LOG.info("Didn't find any usable GPUs on the NodeManager.");
+ // No gpu can be used by YARN.
+ return;
+ }
+
+ long nUsableGpus = usableGpus.size();
+
+ Map<String, ResourceInformation> configuredResourceTypes =
+ ResourceUtils.getResourceTypes();
+ if (!configuredResourceTypes.containsKey(GPU_URI)) {
+ throw new YarnException("Found " + nUsableGpus + " usable GPUs, however "
+ + GPU_URI
+ + " resource-type is not configured inside"
+ + " resource-types.xml, please configure it to enable GPU feature or"
+ + " remove " + GPU_URI + " from "
+ + YarnConfiguration.NM_RESOURCE_PLUGINS);
+ }
+
+ res.setResourceValue(GPU_URI, nUsableGpus);
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java
new file mode 100644
index 0000000..9576ce7
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceHandlerImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
+
+public class GpuResourcePlugin implements ResourcePlugin {
+ private ResourceHandler gpuResourceHandler = null;
+ private GpuNodeResourceUpdateHandler resourceDiscoverHandler = null;
+
+ @Override
+ public synchronized void initialize(Context context) throws YarnException {
+ resourceDiscoverHandler = new GpuNodeResourceUpdateHandler();
+ GpuDiscoverer.getInstance().initialize(context.getConf());
+ }
+
+ @Override
+ public synchronized ResourceHandler createResourceHandler(
+ Context context, CGroupsHandler cGroupsHandler,
+ PrivilegedOperationExecutor privilegedOperationExecutor) {
+ if (gpuResourceHandler == null) {
+ gpuResourceHandler = new GpuResourceHandlerImpl(context, cGroupsHandler,
+ privilegedOperationExecutor);
+ }
+
+ return gpuResourceHandler;
+ }
+
+ @Override
+ public synchronized NodeResourceUpdaterPlugin getNodeResourceHandlerInstance() {
+ return resourceDiscoverHandler;
+ }
+
+ @Override
+ public void cleanup() throws YarnException {
+ // Do nothing.
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java
new file mode 100644
index 0000000..977032a
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlRootElement;
+import java.util.List;
+
+/**
+ * All GPU Device Information in the system.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "nvidia_smi_log")
+public class GpuDeviceInformation {
+ List<PerGpuDeviceInformation> gpus;
+
+ String driverVersion = "N/A";
+
+ // More fields like topology information could be added when needed.
+ // ...
+
+ @javax.xml.bind.annotation.XmlElement(name = "gpu")
+ public List<PerGpuDeviceInformation> getGpus() {
+ return gpus;
+ }
+
+ public void setGpus(List<PerGpuDeviceInformation> gpus) {
+ this.gpus = gpus;
+ }
+
+ @javax.xml.bind.annotation.XmlElement(name = "driver_version")
+ public String getDriverVersion() {
+ return driverVersion;
+ }
+
+ public void setDriverVersion(String driverVersion) {
+ this.driverVersion = driverVersion;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("=== Gpus in the system ===\n").append("\tDriver Version:").append(
+ getDriverVersion()).append("\n");
+
+ if (gpus != null) {
+ for (PerGpuDeviceInformation gpu : gpus) {
+ sb.append("\t").append(gpu.toString()).append("\n");
+ }
+ }
+ return sb.toString();
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
new file mode 100644
index 0000000..1bd92f6
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+import javax.xml.bind.JAXBContext;
+import javax.xml.bind.JAXBException;
+import javax.xml.bind.Unmarshaller;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParserFactory;
+import javax.xml.transform.sax.SAXSource;
+import java.io.StringReader;
+
/**
 * Parse XML and get GPU device information.
 *
 * Unmarshals nvidia-smi's XML output ("-x -q") into a
 * {@link GpuDeviceInformation} via JAXB over a SAX source. The SAX parser is
 * configured to skip DTD loading/validation (see init()). Thread-safe: the
 * lazily-built parser state is only touched inside the synchronized
 * parseXml().
 */
@InterfaceAudience.Private
@InterfaceStability.Unstable
public class GpuDeviceInformationParser {
  private static final Logger LOG = LoggerFactory.getLogger(
      GpuDeviceInformationParser.class);

  // Built lazily on first parseXml() call and reused afterwards.
  private Unmarshaller unmarshaller = null;
  private XMLReader xmlReader = null;

  // One-time setup of the SAX reader and JAXB unmarshaller.
  private void init()
      throws SAXException, ParserConfigurationException, JAXBException {
    SAXParserFactory spf = SAXParserFactory.newInstance();
    // Disable external-dtd since by default nvidia-smi output contains
    // <!DOCTYPE nvidia_smi_log SYSTEM "nvsmi_device_v8.dtd"> in header
    spf.setFeature(
        "http://apache.org/xml/features/nonvalidating/load-external-dtd",
        false);
    spf.setFeature("http://xml.org/sax/features/validation", false);

    JAXBContext jaxbContext = JAXBContext.newInstance(
        GpuDeviceInformation.class);

    this.xmlReader = spf.newSAXParser().getXMLReader();
    this.unmarshaller = jaxbContext.createUnmarshaller();
  }

  /**
   * Parse one complete nvidia-smi XML document.
   *
   * @param xmlContent full XML text as produced by the discovery binary
   * @return the unmarshalled device information
   * @throws YarnException if parser setup or unmarshalling fails (original
   *         SAX/JAXB exception wrapped as the cause)
   */
  public synchronized GpuDeviceInformation parseXml(String xmlContent)
      throws YarnException {
    if (unmarshaller == null) {
      try {
        init();
      } catch (SAXException | ParserConfigurationException | JAXBException e) {
        LOG.error("Exception while initialize parser", e);
        throw new YarnException(e);
      }
    }

    // Wrap the string in a SAXSource so our pre-configured (DTD-disabled)
    // xmlReader is the one that actually reads the document.
    InputSource inputSource = new InputSource(new StringReader(xmlContent));
    SAXSource source = new SAXSource(xmlReader, inputSource);
    try {
      return (GpuDeviceInformation) unmarshaller.unmarshal(source);
    } catch (JAXBException e) {
      LOG.error("Exception while parsing xml", e);
      throw new YarnException(e);
    }
  }
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java
new file mode 100644
index 0000000..f315313
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java
@@ -0,0 +1,165 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlAdapter;
+
/**
 * Capture single GPU device information such as memory size, temperature,
 * utilization.
 *
 * JAXB DTO bound to one <gpu> element of nvidia-smi's XML output; field
 * getters carry the element-name annotations, so member signatures must not
 * change without updating the bindings.
 */
@InterfaceAudience.Private
@InterfaceStability.Unstable
@XmlRootElement(name = "gpu")
public class PerGpuDeviceInformation {

  // Defaults mirror nvidia-smi's "N/A" placeholders; minorNumber -1 means
  // the <minor_number> element was absent or not yet parsed.
  private String productName = "N/A";
  private String uuid = "N/A";
  private int minorNumber = -1;

  private PerGpuUtilizations gpuUtilizations;
  private PerGpuMemoryUsage gpuMemoryUsage;
  private PerGpuTemperature temperature;

  /**
   * Convert formats like "34 C", "75.6 %" to float.
   */
  @InterfaceAudience.Private
  @InterfaceStability.Unstable
  static class StrToFloatBeforeSpaceAdapter extends
      XmlAdapter<String, Float> {
    @Override
    public String marshal(Float v) throws Exception {
      if (v == null) {
        return "";
      }
      return String.valueOf(v);
    }

    @Override
    public Float unmarshal(String v) throws Exception {
      if (v == null) {
        return -1f;
      }

      // Keep only the number before the unit, e.g. "34" from "34 C".
      // NOTE(review): a non-numeric token such as "N/A" would throw
      // NumberFormatException here — confirm nvidia-smi never emits one for
      // the adapted fields.
      return Float.valueOf(v.split(" ")[0]);
    }
  }

  /**
   * Convert formats like "725 MiB" to long.
   */
  @InterfaceAudience.Private
  @InterfaceStability.Unstable
  static class StrToMemAdapter extends XmlAdapter<String, Long> {
    @Override
    public String marshal(Long v) throws Exception {
      if (v == null) {
        return "";
      }
      // Re-attach the unit dropped by unmarshal.
      return String.valueOf(v) + " MiB";
    }

    @Override
    public Long unmarshal(String v) throws Exception {
      if (v == null) {
        return -1L;
      }
      return Long.valueOf(v.split(" ")[0]);
    }
  }

  @XmlElement(name = "temperature")
  public PerGpuTemperature getTemperature() {
    return temperature;
  }

  public void setTemperature(PerGpuTemperature temperature) {
    this.temperature = temperature;
  }

  @XmlElement(name = "uuid")
  public String getUuid() {
    return uuid;
  }

  public void setUuid(String uuid) {
    this.uuid = uuid;
  }

  @XmlElement(name = "product_name")
  public String getProductName() {
    return productName;
  }

  public void setProductName(String productName) {
    this.productName = productName;
  }

  // Minor device number (e.g. N in /dev/nvidiaN); -1 when unknown.
  @XmlElement(name = "minor_number")
  public int getMinorNumber() {
    return minorNumber;
  }

  public void setMinorNumber(int minorNumber) {
    this.minorNumber = minorNumber;
  }

  @XmlElement(name = "utilization")
  public PerGpuUtilizations getGpuUtilizations() {
    return gpuUtilizations;
  }

  public void setGpuUtilizations(PerGpuUtilizations utilizations) {
    this.gpuUtilizations = utilizations;
  }

  // Bound to the <bar1_memory_usage> element, not the main FB memory block.
  @XmlElement(name = "bar1_memory_usage")
  public PerGpuMemoryUsage getGpuMemoryUsage() {
    return gpuMemoryUsage;
  }

  public void setGpuMemoryUsage(PerGpuMemoryUsage gpuMemoryUsage) {
    this.gpuMemoryUsage = gpuMemoryUsage;
  }


  // Human-readable one-line summary; memory/utilization shown only if parsed.
  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append("ProductName=").append(productName).append(", MinorNumber=")
        .append(minorNumber);

    if (getGpuMemoryUsage() != null) {
      sb.append(", TotalMemory=").append(
          getGpuMemoryUsage().getTotalMemoryMiB()).append("MiB");
    }

    if (getGpuUtilizations() != null) {
      sb.append(", Utilization=").append(
          getGpuUtilizations().getOverallGpuUtilization()).append("%");
    }
    return sb.toString();
  }
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java
new file mode 100644
index 0000000..3964c4e
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
+
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "bar1_memory_usage")
+public class PerGpuMemoryUsage {
+ long usedMemoryMiB = -1L;
+ long availMemoryMiB = -1L;
+
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToMemAdapter.class)
+ @XmlElement(name = "used")
+ public Long getUsedMemoryMiB() {
+ return usedMemoryMiB;
+ }
+
+ public void setUsedMemoryMiB(Long usedMemoryMiB) {
+ this.usedMemoryMiB = usedMemoryMiB;
+ }
+
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToMemAdapter.class)
+ @XmlElement(name = "free")
+ public Long getAvailMemoryMiB() {
+ return availMemoryMiB;
+ }
+
+ public void setAvailMemoryMiB(Long availMemoryMiB) {
+ this.availMemoryMiB = availMemoryMiB;
+ }
+
+ public long getTotalMemoryMiB() {
+ return usedMemoryMiB + availMemoryMiB;
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java
new file mode 100644
index 0000000..ccd60cb
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
+
+/**
+ * Temperature of GPU
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "temperature")
+public class PerGpuTemperature {
+ private float currentGpuTemp = Float.MIN_VALUE;
+ private float maxGpuTemp = Float.MIN_VALUE;
+ private float slowThresholdGpuTemp = Float.MIN_VALUE;
+
+ /**
+ * Get the current GPU temperature in Celsius
+ * @return temperature
+ */
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+ @XmlElement(name = "gpu_temp")
+ public Float getCurrentGpuTemp() {
+ return currentGpuTemp;
+ }
+
+ public void setCurrentGpuTemp(Float currentGpuTemp) {
+ this.currentGpuTemp = currentGpuTemp;
+ }
+
+ /**
+ * Get the maximum possible GPU temperature in Celsius
+ * @return temperature
+ */
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+ @XmlElement(name = "gpu_temp_max_threshold")
+ public Float getMaxGpuTemp() {
+ return maxGpuTemp;
+ }
+
+ public void setMaxGpuTemp(Float maxGpuTemp) {
+ this.maxGpuTemp = maxGpuTemp;
+ }
+
+ /**
+ * Get the GPU temperature in Celsius above which the GPU may run slower
+ * @return temperature
+ */
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+ @XmlElement(name = "gpu_temp_slow_threshold")
+ public Float getSlowThresholdGpuTemp() {
+ return slowThresholdGpuTemp;
+ }
+
+ public void setSlowThresholdGpuTemp(Float slowThresholdGpuTemp) {
+ this.slowThresholdGpuTemp = slowThresholdGpuTemp;
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java
new file mode 100644
index 0000000..4ef218b
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
+
+/**
+ * GPU utilizations
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "utilization")
+public class PerGpuUtilizations {
+ private float overallGpuUtilization;
+
+ /**
+ * Overall GPU utilization in percent
+ * @return utilization
+ */
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+ @XmlElement(name = "gpu_util")
+ public Float getOverallGpuUtilization() {
+ return overallGpuUtilization;
+ }
+
+ public void setOverallGpuUtilization(Float overallGpuUtilization) {
+ this.overallGpuUtilization = overallGpuUtilization;
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java
new file mode 100644
index 0000000..13b3ee9
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java
@@ -0,0 +1,164 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.net.ServerSocketUtil;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.event.Dispatcher;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.factories.RecordFactory;
+import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
+import org.apache.hadoop.yarn.server.api.ResourceTracker;
+import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest;
+import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
+import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
+import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse;
+import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerRequest;
+import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerResponse;
+import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.NodeHeartbeatResponsePBImpl;
+import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.RegisterNodeManagerResponsePBImpl;
+import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.UnRegisterNodeManagerResponsePBImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
+import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
+import org.junit.Assert;
+import org.junit.Before;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+
+public class NodeManagerTestBase {
+ // temp fix until metrics system can auto-detect itself running in unit test:
+ static {
+ DefaultMetricsSystem.setMiniClusterMode(true);
+ }
+
+ protected static final Logger LOG =
+ LoggerFactory.getLogger(TestNodeStatusUpdater.class);
+ protected static final File basedir =
+ new File("target", TestNodeStatusUpdater.class.getName());
+ protected static final File nmLocalDir = new File(basedir, "nm0");
+ protected static final File tmpDir = new File(basedir, "tmpDir");
+ protected static final File remoteLogsDir = new File(basedir, "remotelogs");
+ protected static final File logsDir = new File(basedir, "logs");
+ protected static final RecordFactory recordFactory = RecordFactoryProvider
+ .getRecordFactory(null);
+ protected Configuration conf;
+
+ protected YarnConfiguration createNMConfig() throws IOException {
+ return createNMConfig(ServerSocketUtil.getPort(49170, 10));
+ }
+
+ protected YarnConfiguration createNMConfig(int port) throws IOException {
+ YarnConfiguration conf = new YarnConfiguration();
+ String localhostAddress = null;
+ try {
+ localhostAddress = InetAddress.getByName("localhost")
+ .getCanonicalHostName();
+ } catch (UnknownHostException e) {
+ Assert.fail("Unable to get localhost address: " + e.getMessage());
+ }
+ conf.setInt(YarnConfiguration.NM_PMEM_MB, 5 * 1024); // 5GB
+ conf.set(YarnConfiguration.NM_ADDRESS, localhostAddress + ":" + port);
+ conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, localhostAddress + ":"
+ + ServerSocketUtil.getPort(49160, 10));
+ conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath());
+ conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
+ remoteLogsDir.getAbsolutePath());
+ conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
+ conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
+ return conf;
+ }
+
+ public static class BaseResourceTrackerForTest implements ResourceTracker {
+ @Override
+ public RegisterNodeManagerResponse registerNodeManager(
+ RegisterNodeManagerRequest request) throws YarnException, IOException {
+ return new RegisterNodeManagerResponsePBImpl();
+ }
+
+ @Override
+ public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request)
+ throws YarnException, IOException {
+ return new NodeHeartbeatResponsePBImpl();
+ }
+
+ @Override
+ public UnRegisterNodeManagerResponse unRegisterNodeManager(
+ UnRegisterNodeManagerRequest request)
+ throws YarnException, IOException {
+ return new UnRegisterNodeManagerResponsePBImpl();
+ }
+ }
+
+ protected static class BaseNodeStatusUpdaterForTest extends NodeStatusUpdaterImpl {
+ public ResourceTracker resourceTracker;
+ protected Context context;
+
+ public BaseNodeStatusUpdaterForTest(Context context, Dispatcher dispatcher,
+ NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
+ ResourceTracker resourceTracker) {
+ super(context, dispatcher, healthChecker, metrics);
+ this.context = context;
+ this.resourceTracker = resourceTracker;
+ }
+ @Override
+ protected ResourceTracker getRMClient() {
+ return resourceTracker;
+ }
+
+ @Override
+ protected void stopRMProxy() {
+ return;
+ }
+ }
+
+ public class MyContainerManager extends ContainerManagerImpl {
+ public boolean signaled = false;
+
+ public MyContainerManager(Context context, ContainerExecutor exec,
+ DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater,
+ NodeManagerMetrics metrics,
+ LocalDirsHandlerService dirsHandler) {
+ super(context, exec, deletionContext, nodeStatusUpdater,
+ metrics, dirsHandler);
+ }
+
+ @Override
+ public void handle(ContainerManagerEvent event) {
+ if (event.getType() == ContainerManagerEventType.SIGNAL_CONTAINERS) {
+ signaled = true;
+ }
+ }
+ }
+
+ @Before
+ public void setUp() throws IOException {
+ nmLocalDir.mkdirs();
+ tmpDir.mkdirs();
+ logsDir.mkdirs();
+ remoteLogsDir.mkdirs();
+ conf = createNMConfig();
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java
index 2e9eff5..9b180c7 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java
@@ -178,7 +178,7 @@ public class TestDefaultContainerExecutor {
FileContext lfs = FileContext.getLocalFSFileContext(conf);
DefaultContainerExecutor executor = new DefaultContainerExecutor(lfs);
executor.setConf(conf);
- executor.init();
+ executor.init(null);
try {
executor.createUserLocalDirs(localDirs, user);
@@ -317,7 +317,7 @@ public class TestDefaultContainerExecutor {
Path workDir = localDir;
Path pidFile = new Path(workDir, "pid.txt");
- mockExec.init();
+ mockExec.init(null);
mockExec.activateContainer(cId, pidFile);
int ret = mockExec.launchContainer(new ContainerStartContext.Builder()
.setContainer(container)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDockerContainerExecutorWithMocks.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDockerContainerExecutorWithMocks.java
index f1194c9..7e1752b 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDockerContainerExecutorWithMocks.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDockerContainerExecutorWithMocks.java
@@ -116,7 +116,7 @@ public class TestDockerContainerExecutorWithMocks {
public void testContainerInitSecure() throws IOException {
dockerContainerExecutor.getConf().set(
CommonConfigurationKeys.HADOOP_SECURITY_AUTHENTICATION, "kerberos");
- dockerContainerExecutor.init();
+ dockerContainerExecutor.init(mock(Context.class));
}
@Test(expected = IllegalArgumentException.class)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
index cf8d977..95c8f5e 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
@@ -628,7 +628,7 @@ public class TestLinuxContainerExecutor {
LinuxContainerExecutor lce = new LinuxContainerExecutor();
lce.setConf(conf);
try {
- lce.init();
+ lce.init(null);
} catch (IOException e) {
// expected if LCE isn't setup right, but not necessary for this test
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
index 79b88cf..249e017 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
@@ -426,7 +426,7 @@ public class TestLinuxContainerExecutorWithMocks {
@Test
public void testInit() throws Exception {
- mockExec.init();
+ mockExec.init(mock(Context.class));
assertEquals(Arrays.asList("--checksetup"), readMockParams());
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java
index 9279711..b31215b 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java
@@ -37,7 +37,7 @@ public class TestNodeManager {
public static final class InvalidContainerExecutor extends
DefaultContainerExecutor {
@Override
- public void init() throws IOException {
+ public void init(Context nmContext) throws IOException {
throw new IOException("dummy executor init called");
}
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
index 055dab4..533cf2a 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
@@ -20,16 +20,14 @@ package org.apache.hadoop.yarn.server.nodemanager;
import static org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils.newNodeHeartbeatResponse;
import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
-import java.net.UnknownHostException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
@@ -80,8 +78,6 @@ import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
-import org.apache.hadoop.yarn.factories.RecordFactory;
-import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.NodeHeartbeatResponseProto;
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.api.ResourceTracker;
@@ -117,41 +113,14 @@ import org.junit.Before;
import org.junit.Test;
@SuppressWarnings("rawtypes")
-public class TestNodeStatusUpdater {
-
- // temp fix until metrics system can auto-detect itself running in unit test:
- static {
- DefaultMetricsSystem.setMiniClusterMode(true);
- }
-
- static final Logger LOG =
- LoggerFactory.getLogger(TestNodeStatusUpdater.class);
- static final File basedir =
- new File("target", TestNodeStatusUpdater.class.getName());
- static final File nmLocalDir = new File(basedir, "nm0");
- static final File tmpDir = new File(basedir, "tmpDir");
- static final File remoteLogsDir = new File(basedir, "remotelogs");
- static final File logsDir = new File(basedir, "logs");
- private static final RecordFactory recordFactory = RecordFactoryProvider
- .getRecordFactory(null);
-
+public class TestNodeStatusUpdater extends NodeManagerTestBase {
volatile int heartBeatID = 0;
volatile Throwable nmStartError = null;
private final List<NodeId> registeredNodes = new ArrayList<NodeId>();
private boolean triggered = false;
- private Configuration conf;
private NodeManager nm;
private AtomicBoolean assertionFailedInThread = new AtomicBoolean(false);
- @Before
- public void setUp() throws IOException {
- nmLocalDir.mkdirs();
- tmpDir.mkdirs();
- logsDir.mkdirs();
- remoteLogsDir.mkdirs();
- conf = createNMConfig();
- }
-
@After
public void tearDown() {
this.registeredNodes.clear();
@@ -332,29 +301,7 @@ public class TestNodeStatusUpdater {
}
}
- private class MyContainerManager extends ContainerManagerImpl {
- public boolean signaled = false;
-
- public MyContainerManager(Context context, ContainerExecutor exec,
- DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater,
- NodeManagerMetrics metrics,
- LocalDirsHandlerService dirsHandler) {
- super(context, exec, deletionContext, nodeStatusUpdater,
- metrics, dirsHandler);
- }
-
- @Override
- public void handle(ContainerManagerEvent event) {
- if (event.getType() == ContainerManagerEventType.SIGNAL_CONTAINERS) {
- signaled = true;
- }
- }
- }
-
- private class MyNodeStatusUpdater extends NodeStatusUpdaterImpl {
- public ResourceTracker resourceTracker;
- private Context context;
-
+ private class MyNodeStatusUpdater extends BaseNodeStatusUpdaterForTest {
public MyNodeStatusUpdater(Context context, Dispatcher dispatcher,
NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) {
this(context, dispatcher, healthChecker, metrics, false);
@@ -363,19 +310,8 @@ public class TestNodeStatusUpdater {
public MyNodeStatusUpdater(Context context, Dispatcher dispatcher,
NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
boolean signalContainer) {
- super(context, dispatcher, healthChecker, metrics);
- this.context = context;
- resourceTracker = new MyResourceTracker(this.context, signalContainer);
- }
-
- @Override
- protected ResourceTracker getRMClient() {
- return resourceTracker;
- }
-
- @Override
- protected void stopRMProxy() {
- return;
+ super(context, dispatcher, healthChecker, metrics,
+ new MyResourceTracker(context, signalContainer));
}
}
@@ -1818,7 +1754,6 @@ public class TestNodeStatusUpdater {
Assert.assertTrue("Test failed with exception(s)" + exceptions,
exceptions.isEmpty());
}
-
// Add new containers info into NM context each time node heart beats.
private class MyNMContext extends NMContext {
@@ -1922,31 +1857,6 @@ public class TestNodeStatusUpdater {
this.registeredNodes.size());
}
- private YarnConfiguration createNMConfig(int port) throws IOException {
- YarnConfiguration conf = new YarnConfiguration();
- String localhostAddress = null;
- try {
- localhostAddress = InetAddress.getByName("localhost")
- .getCanonicalHostName();
- } catch (UnknownHostException e) {
- Assert.fail("Unable to get localhost address: " + e.getMessage());
- }
- conf.setInt(YarnConfiguration.NM_PMEM_MB, 5 * 1024); // 5GB
- conf.set(YarnConfiguration.NM_ADDRESS, localhostAddress + ":" + port);
- conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, localhostAddress + ":"
- + ServerSocketUtil.getPort(49160, 10));
- conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath());
- conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
- remoteLogsDir.getAbsolutePath());
- conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
- conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
- return conf;
- }
-
- private YarnConfiguration createNMConfig() throws IOException {
- return createNMConfig(ServerSocketUtil.getPort(49170, 10));
- }
-
private NodeManager getNodeManager(final NodeAction nodeHeartBeatAction) {
return new NodeManager() {
@Override
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java
index 3c432d3..4b4f356 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java
@@ -18,26 +18,6 @@
package org.apache.hadoop.yarn.server.nodemanager.amrmproxy;
-import java.io.IOException;
-import java.security.PrivilegedExceptionAction;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ConcurrentLinkedQueue;
-import java.util.concurrent.ConcurrentMap;
-import java.util.concurrent.ExecutorCompletionService;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.hadoop.yarn.server.nodemanager.ContainerStateTransitionListener;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
@@ -66,6 +46,7 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport;
import org.apache.hadoop.yarn.server.api.records.AppCollectorData;
import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.ContainerStateTransitionListener;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
@@ -74,18 +55,37 @@ import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMMemoryStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredAMRMProxyState;
-import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM;
import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher;
+import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.util.Records;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.PrivilegedExceptionAction;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
/**
* Base class for all the AMRMProxyService test cases. It provides utility
@@ -805,5 +805,9 @@ public abstract class BaseAMRMProxyTest {
public NMTimelinePublisher getNMTimelinePublisher() {
return null;
}
+
+ public ResourcePluginManager getResourcePluginManager() {
+ return null;
+ }
}
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java
index e5414a5..0563694 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java
@@ -22,6 +22,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resourc
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
@@ -30,6 +31,8 @@ import org.slf4j.LoggerFactory;
import java.util.List;
+import static org.mockito.Mockito.mock;
+
public class TestResourceHandlerModule {
private static final Logger LOG =
LoggerFactory.getLogger(TestResourceHandlerModule.class);
@@ -62,7 +65,7 @@ public class TestResourceHandlerModule {
//Ensure that outbound bandwidth resource handler is present in the chain
ResourceHandlerChain resourceHandlerChain = ResourceHandlerModule
- .getConfiguredResourceHandlerChain(networkEnabledConf);
+ .getConfiguredResourceHandlerChain(networkEnabledConf, mock(Context.class));
List<ResourceHandler> resourceHandlers = resourceHandlerChain
.getResourceHandlerList();
//Exactly one resource handler in chain
@@ -88,7 +91,8 @@ public class TestResourceHandlerModule {
Assert.assertNotNull(handler);
ResourceHandlerChain resourceHandlerChain =
- ResourceHandlerModule.getConfiguredResourceHandlerChain(diskConf);
+ ResourceHandlerModule.getConfiguredResourceHandlerChain(diskConf,
+ mock(Context.class));
List<ResourceHandler> resourceHandlers =
resourceHandlerChain.getResourceHandlerList();
// Exactly one resource handler in chain
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
new file mode 100644
index 0000000..1c4313c
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
@@ -0,0 +1,385 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.yarn.api.protocolrecords.ResourceTypes;
+import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
+import org.apache.hadoop.yarn.api.records.ApplicationId;
+import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceInformation;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
+import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
+import org.apache.hadoop.yarn.util.resource.ResourceUtils;
+import org.apache.hadoop.yarn.util.resource.TestResourceUtils;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.anyList;
+import static org.mockito.Matchers.anyListOf;
+import static org.mockito.Matchers.anyString;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.doThrow;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+public class TestGpuResourceHandler {
+ private CGroupsHandler mockCGroupsHandler;
+ private PrivilegedOperationExecutor mockPrivilegedExecutor;
+ private GpuResourceHandlerImpl gpuResourceHandler;
+ private NMStateStoreService mockNMStateStore;
+ private ConcurrentHashMap<ContainerId, Container> runningContainersMap;
+
+ @Before
+ public void setup() {
+ TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI);
+
+ mockCGroupsHandler = mock(CGroupsHandler.class);
+ mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class);
+ mockNMStateStore = mock(NMStateStoreService.class);
+
+ Context nmctx = mock(Context.class);
+ when(nmctx.getNMStateStore()).thenReturn(mockNMStateStore);
+ runningContainersMap = new ConcurrentHashMap<>();
+ when(nmctx.getContainers()).thenReturn(runningContainersMap);
+
+ gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler,
+ mockPrivilegedExecutor);
+ }
+
+ @Test
+ public void testBootStrap() throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0");
+
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ verify(mockCGroupsHandler, times(1)).initializeCGroupController(
+ CGroupsHandler.CGroupController.DEVICES);
+ }
+
+ private static ContainerId getContainerId(int id) {
+ return ContainerId.newContainerId(ApplicationAttemptId
+ .newInstance(ApplicationId.newInstance(1234L, 1), 1), id);
+ }
+
+ private static Container mockContainerWithGpuRequest(int id,
+ int numGpuRequest) {
+ Container c = mock(Container.class);
+ when(c.getContainerId()).thenReturn(getContainerId(id));
+
+ Resource res = Resource.newInstance(1024, 1);
+ ResourceMappings resMapping = new ResourceMappings();
+
+ res.setResourceValue(ResourceInformation.GPU_URI, numGpuRequest);
+ when(c.getResource()).thenReturn(res);
+ when(c.getResourceMappings()).thenReturn(resMapping);
+ return c;
+ }
+
+ private void verifyDeniedDevices(ContainerId containerId,
+ List<Integer> deniedDevices)
+ throws ResourceHandlerException, PrivilegedOperationException {
+ verify(mockCGroupsHandler, times(1)).createCGroup(
+ CGroupsHandler.CGroupController.DEVICES, containerId.toString());
+
+ if (null != deniedDevices && !deniedDevices.isEmpty()) {
+ verify(mockPrivilegedExecutor, times(1)).executePrivilegedOperation(
+ new PrivilegedOperation(PrivilegedOperation.OperationType.GPU, Arrays
+ .asList(GpuResourceHandlerImpl.CONTAINER_ID_CLI_OPTION,
+ containerId.toString(),
+ GpuResourceHandlerImpl.EXCLUDED_GPUS_CLI_OPTION,
+ StringUtils.join(",", deniedDevices))), true);
+ }
+ }
+
+ @Test
+ public void testAllocation() throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ /* Start container 1, asking for 3 GPUs */
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
+
+ // Only device=4 will be blocked.
+ verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
+
+ /* Start container 2, asking for 2 GPUs. Expected to fail */
+ boolean failedToAllocate = false;
+ try {
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 2));
+ } catch (ResourceHandlerException e) {
+ failedToAllocate = true;
+ }
+ Assert.assertTrue(failedToAllocate);
+
+ /* Start container 3, asking for 1 GPU; expected to succeed */
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(3, 1));
+
+ // devices = 0/1/3 will be blocked
+ verifyDeniedDevices(getContainerId(3), Arrays.asList(0, 1, 3));
+
+ /* Start container 4, asking for 0 GPUs; expected to succeed */
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(4, 0));
+
+ // All devices will be blocked
+ verifyDeniedDevices(getContainerId(4), Arrays.asList(0, 1, 3, 4));
+
+ /* Release container-1, expect cgroups deleted */
+ gpuResourceHandler.postComplete(getContainerId(1));
+
+ verify(mockCGroupsHandler, times(1)).createCGroup(
+ CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
+ Assert.assertEquals(3,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ /* Release container-3, expect cgroups deleted */
+ gpuResourceHandler.postComplete(getContainerId(3));
+
+ verify(mockCGroupsHandler, times(1)).createCGroup(
+ CGroupsHandler.CGroupController.DEVICES, getContainerId(3).toString());
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+ }
+
+ @SuppressWarnings("unchecked")
+ @Test
+ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
+ throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ doThrow(new IOException("Exception ...")).when(mockNMStateStore)
+ .storeAssignedResources(
+ any(ContainerId.class), anyString(), anyList());
+
+ boolean exception = false;
+ /* Start container 1, asking for 3 GPUs */
+ try {
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
+ } catch (ResourceHandlerException e) {
+ exception = true;
+ }
+
+ Assert.assertTrue("preStart should throw exception", exception);
+
+ // After preStart, we still have 4 available GPU since the store op fails.
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+ }
+
+ @Test
+ public void testAllocationWithoutAllowedGpus() throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ Assert.assertEquals(0,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ /* Start container 1, asking for 0 GPUs */
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 0));
+ verifyDeniedDevices(getContainerId(1), Collections.<Integer>emptyList());
+
+ /* Start container 2, asking for 1 GPU. Expected to fail */
+ boolean failedToAllocate = false;
+ try {
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 1));
+ } catch (ResourceHandlerException e) {
+ failedToAllocate = true;
+ }
+ Assert.assertTrue(failedToAllocate);
+
+ /* Release container 1, expect cgroups deleted */
+ gpuResourceHandler.postComplete(getContainerId(1));
+
+ verify(mockCGroupsHandler, times(1)).createCGroup(
+ CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
+ Assert.assertEquals(0,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+ }
+
+ @Test
+ public void testAllocationStored() throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ /* Start container 1, asking for 3 GPUs */
+ Container container = mockContainerWithGpuRequest(1, 3);
+ gpuResourceHandler.preStart(container);
+
+ verify(mockNMStateStore).storeAssignedResources(getContainerId(1),
+ ResourceInformation.GPU_URI,
+ Arrays.<Serializable>asList("0", "1", "3"));
+
+ Assert.assertEquals(3, container.getResourceMappings()
+ .getAssignedResources(ResourceInformation.GPU_URI).size());
+
+ // Only device=4 will be blocked.
+ verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
+
+ /* Start container 2, asking for 0 GPUs; expected to succeed */
+ container = mockContainerWithGpuRequest(2, 0);
+ gpuResourceHandler.preStart(container);
+
+ verifyDeniedDevices(getContainerId(2), Arrays.asList(0, 1, 3, 4));
+ Assert.assertEquals(0, container.getResourceMappings()
+ .getAssignedResources(ResourceInformation.GPU_URI).size());
+
+ // Store assigned resource will not be invoked.
+ verify(mockNMStateStore, never()).storeAssignedResources(
+ eq(getContainerId(2)), eq(ResourceInformation.GPU_URI),
+ anyListOf(Serializable.class));
+ }
+
+ @Test
+ public void testRecoverResourceAllocation() throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ Container nmContainer = mock(Container.class);
+ ResourceMappings rmap = new ResourceMappings();
+ ResourceMappings.AssignedResources ar =
+ new ResourceMappings.AssignedResources();
+ ar.updateAssignedResources(Arrays.<Serializable>asList("1", "3"));
+ rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
+ when(nmContainer.getResourceMappings()).thenReturn(rmap);
+
+ runningContainersMap.put(getContainerId(1), nmContainer);
+
+ // TEST CASE
+ // Reacquiring a container restores the state of the GPU resource allocator.
+ gpuResourceHandler.reacquireContainer(getContainerId(1));
+
+ Map<Integer, ContainerId> deviceAllocationMapping =
+ gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
+ Assert.assertEquals(2, deviceAllocationMapping.size());
+ Assert.assertTrue(
+ deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
+ Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
+
+ // TEST CASE
+ // Try to reacquire a container but requested device is not in allowed list.
+ nmContainer = mock(Container.class);
+ rmap = new ResourceMappings();
+ ar = new ResourceMappings.AssignedResources();
+ // id=5 is not in allowed list.
+ ar.updateAssignedResources(Arrays.<Serializable>asList("4", "5"));
+ rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
+ when(nmContainer.getResourceMappings()).thenReturn(rmap);
+
+ runningContainersMap.put(getContainerId(2), nmContainer);
+
+ boolean caughtException = false;
+ try {
+ gpuResourceHandler.reacquireContainer(getContainerId(1));
+ } catch (ResourceHandlerException e) {
+ caughtException = true;
+ }
+ Assert.assertTrue(
+ "Should fail since requested device Id is not in allowed list",
+ caughtException);
+
+ // Make sure internal state not changed.
+ deviceAllocationMapping =
+ gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
+ Assert.assertEquals(2, deviceAllocationMapping.size());
+ Assert.assertTrue(
+ deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
+ Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
+
+ // TEST CASE
+ // Try to reacquire a container but requested device is already assigned.
+ nmContainer = mock(Container.class);
+ rmap = new ResourceMappings();
+ ar = new ResourceMappings.AssignedResources();
+ // id=3 is already assigned
+ ar.updateAssignedResources(Arrays.<Serializable>asList("4", "3"));
+ rmap.addAssignedResources("gpu", ar);
+ when(nmContainer.getResourceMappings()).thenReturn(rmap);
+
+ runningContainersMap.put(getContainerId(2), nmContainer);
+
+ caughtException = false;
+ try {
+ gpuResourceHandler.reacquireContainer(getContainerId(1));
+ } catch (ResourceHandlerException e) {
+ caughtException = true;
+ }
+ Assert.assertTrue(
+ "Should fail since requested device Id is not in allowed list",
+ caughtException);
+
+ // Make sure internal state not changed.
+ deviceAllocationMapping =
+ gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
+ Assert.assertEquals(2, deviceAllocationMapping.size());
+ Assert.assertTrue(
+ deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
+ Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
index 318ae6b..a147afb 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
@@ -70,7 +70,7 @@ public class TestContainersMonitorResourceChange {
private static class MockExecutor extends ContainerExecutor {
@Override
- public void init() throws IOException {
+ public void init(Context nmContext) throws IOException {
}
@Override
public void startLocalizer(LocalizerStartContext ctx)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java
new file mode 100644
index 0000000..bcadf76
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java
@@ -0,0 +1,261 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.service.ServiceOperations;
+import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.event.Dispatcher;
+import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
+import org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
+import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
+import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
+import org.apache.hadoop.yarn.server.nodemanager.NodeManagerTestBase;
+import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
+import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import static org.mockito.Matchers.any;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+public class TestResourcePluginManager extends NodeManagerTestBase {
+ private NodeManager nm;
+
+ ResourcePluginManager stubResourcePluginmanager() {
+ // Stub ResourcePluginManager
+ final ResourcePluginManager rpm = mock(ResourcePluginManager.class);
+ Map<String, ResourcePlugin> plugins = new HashMap<>();
+
+ // First resource plugin
+ ResourcePlugin resourcePlugin = mock(ResourcePlugin.class);
+ NodeResourceUpdaterPlugin nodeResourceUpdaterPlugin = mock(
+ NodeResourceUpdaterPlugin.class);
+ when(resourcePlugin.getNodeResourceHandlerInstance()).thenReturn(
+ nodeResourceUpdaterPlugin);
+ plugins.put("resource1", resourcePlugin);
+
+ // Second resource plugin
+ resourcePlugin = mock(ResourcePlugin.class);
+ when(resourcePlugin.createResourceHandler(any(Context.class), any(
+ CGroupsHandler.class), any(PrivilegedOperationExecutor.class)))
+ .thenReturn(new CustomizedResourceHandler());
+ plugins.put("resource2", resourcePlugin);
+ when(rpm.getNameToPlugins()).thenReturn(plugins);
+ return rpm;
+ }
+
+ @After
+ public void tearDown() {
+ if (nm != null) {
+ try {
+ ServiceOperations.stop(nm);
+ } catch (Throwable t) {
+ // ignore
+ }
+ }
+ }
+
+ private class CustomizedResourceHandler implements ResourceHandler {
+
+ @Override
+ public List<PrivilegedOperation> bootstrap(Configuration configuration)
+ throws ResourceHandlerException {
+ return null;
+ }
+
+ @Override
+ public List<PrivilegedOperation> preStart(Container container)
+ throws ResourceHandlerException {
+ return null;
+ }
+
+ @Override
+ public List<PrivilegedOperation> reacquireContainer(ContainerId containerId)
+ throws ResourceHandlerException {
+ return null;
+ }
+
+ @Override
+ public List<PrivilegedOperation> postComplete(ContainerId containerId)
+ throws ResourceHandlerException {
+ return null;
+ }
+
+ @Override
+ public List<PrivilegedOperation> teardown()
+ throws ResourceHandlerException {
+ return null;
+ }
+ }
+
+ private class MyMockNM extends NodeManager {
+ private final ResourcePluginManager rpm;
+
+ public MyMockNM(ResourcePluginManager rpm) {
+ this.rpm = rpm;
+ }
+
+ @Override
+ protected NodeStatusUpdater createNodeStatusUpdater(Context context,
+ Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
+ ((NodeManager.NMContext)context).setResourcePluginManager(rpm);
+ return new BaseNodeStatusUpdaterForTest(context, dispatcher, healthChecker,
+ metrics, new BaseResourceTrackerForTest());
+ }
+
+ @Override
+ protected ContainerManagerImpl createContainerManager(Context context,
+ ContainerExecutor exec, DeletionService del,
+ NodeStatusUpdater nodeStatusUpdater,
+ ApplicationACLsManager aclsManager,
+ LocalDirsHandlerService diskhandler) {
+ return new MyContainerManager(context, exec, del, nodeStatusUpdater,
+ metrics, diskhandler);
+ }
+
+ @Override
+ protected ResourcePluginManager createResourcePluginManager() {
+ return rpm;
+ }
+ }
+
+ public class MyLCE extends LinuxContainerExecutor {
+ private PrivilegedOperationExecutor poe = mock(PrivilegedOperationExecutor.class);
+
+ @Override
+ protected PrivilegedOperationExecutor getPrivilegedOperationExecutor() {
+ return poe;
+ }
+ }
+
+ /*
+ * Make sure ResourcePluginManager is initialized during NM start up.
+ */
+ @Test(timeout = 30000)
+ public void testResourcePluginManagerInitialization() throws Exception {
+ final ResourcePluginManager rpm = stubResourcePluginmanager();
+ nm = new MyMockNM(rpm);
+
+ YarnConfiguration conf = createNMConfig();
+ nm.init(conf);
+ verify(rpm, times(1)).initialize(
+ any(Context.class));
+ }
+
+ /*
+ * Make sure ResourcePluginManager is invoked during NM update.
+ */
+ @Test(timeout = 30000)
+ public void testNodeStatusUpdaterWithResourcePluginsEnabled() throws Exception {
+ final ResourcePluginManager rpm = stubResourcePluginmanager();
+
+ nm = new MyMockNM(rpm);
+
+ YarnConfiguration conf = createNMConfig();
+ nm.init(conf);
+ nm.start();
+
+ NodeResourceUpdaterPlugin nodeResourceUpdaterPlugin =
+ rpm.getNameToPlugins().get("resource1")
+ .getNodeResourceHandlerInstance();
+
+ verify(nodeResourceUpdaterPlugin, times(1)).updateConfiguredResource(
+ any(Resource.class));
+ }
+
+ /*
+ * Make sure ResourcePluginManager is used to initialize ResourceHandlerChain
+ */
+ @Test(timeout = 30000)
+ public void testLinuxContainerExecutorWithResourcePluginsEnabled() throws Exception {
+ final ResourcePluginManager rpm = stubResourcePluginmanager();
+ final LinuxContainerExecutor lce = new MyLCE();
+
+ nm = new NodeManager() {
+ @Override
+ protected NodeStatusUpdater createNodeStatusUpdater(Context context,
+ Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
+ ((NMContext)context).setResourcePluginManager(rpm);
+ return new BaseNodeStatusUpdaterForTest(context, dispatcher, healthChecker,
+ metrics, new BaseResourceTrackerForTest());
+ }
+
+ @Override
+ protected ContainerManagerImpl createContainerManager(Context context,
+ ContainerExecutor exec, DeletionService del,
+ NodeStatusUpdater nodeStatusUpdater,
+ ApplicationACLsManager aclsManager,
+ LocalDirsHandlerService diskhandler) {
+ return new MyContainerManager(context, exec, del, nodeStatusUpdater,
+ metrics, diskhandler);
+ }
+
+ @Override
+ protected ContainerExecutor createContainerExecutor(Configuration conf) {
+ ((NMContext)this.getNMContext()).setResourcePluginManager(rpm);
+ lce.setConf(conf);
+ return lce;
+ }
+ };
+
+ YarnConfiguration conf = createNMConfig();
+
+ nm.init(conf);
+ nm.start();
+
+ ResourceHandler handler = lce.getResourceHandler();
+ Assert.assertNotNull(handler);
+ Assert.assertTrue(handler instanceof ResourceHandlerChain);
+
+ boolean newHandlerAdded = false;
+ for (ResourceHandler h : ((ResourceHandlerChain) handler)
+ .getResourceHandlerList()) {
+ if (h instanceof CustomizedResourceHandler) {
+ newHandlerAdded = true;
+ break;
+ }
+ }
+ Assert.assertTrue("New ResourceHandler should be added", newHandlerAdded);
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
new file mode 100644
index 0000000..83bace2
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
@@ -0,0 +1,123 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
+import org.junit.Assert;
+import org.junit.Assume;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.List;
+
+public class TestGpuDiscoverer {
+ private String getTestParentFolder() {
+ File f = new File("target/temp/" + TestGpuDiscoverer.class.getName());
+ return f.getAbsolutePath();
+ }
+
+ private void touchFile(File f) throws IOException {
+ new FileOutputStream(f).close();
+ }
+
+ @Before
+ public void before() throws IOException {
+ String folder = getTestParentFolder();
+ File f = new File(folder);
+ FileUtils.deleteDirectory(f);
+ f.mkdirs();
+ }
+
+ @Test
+ public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception {
+ // Only run this on demand.
+ Assume.assumeTrue(Boolean.valueOf(
+ System.getProperty("RunLinuxGpuResourceDiscoverPluginConfigTest")));
+
+ // test case 1, check default setting.
+ Configuration conf = new Configuration(false);
+ GpuDiscoverer plugin = new GpuDiscoverer();
+ plugin.initialize(conf);
+ Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
+ plugin.getPathOfGpuBinary());
+ Assert.assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH"));
+ Assert.assertTrue(
+ plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
+
+ // test case 2, check behavior when the binary path is explicitly set.
+ File fakeBinary = new File(getTestParentFolder(),
+ GpuDiscoverer.DEFAULT_BINARY_NAME);
+ touchFile(fakeBinary);
+ conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder());
+ plugin = new GpuDiscoverer();
+ plugin.initialize(conf);
+ Assert.assertEquals(fakeBinary.getAbsolutePath(),
+ plugin.getPathOfGpuBinary());
+ Assert.assertNull(plugin.getEnvironmentToRunCommand().get("PATH"));
+
+ // test case 3, the binary path is explicitly set but the binary doesn't
+ // exist, so the default path will be used instead.
+ fakeBinary.delete();
+ plugin = new GpuDiscoverer();
+ plugin.initialize(conf);
+ Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
+ plugin.getPathOfGpuBinary());
+ Assert.assertTrue(
+ plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
+ }
+
+ @Test
+ public void testGpuDiscover() throws YarnException {
+ // Since this test requires real GPU hardware, only run it on demand when
+ // runGpuDiscoverUnitTest is set (-DrunGpuDiscoverUnitTest=true)
+ Assume.assumeTrue(
+ Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest")));
+ Configuration conf = new Configuration(false);
+ GpuDiscoverer plugin = new GpuDiscoverer();
+ plugin.initialize(conf);
+ GpuDeviceInformation info = plugin.getGpuDeviceInformation();
+
+ Assert.assertTrue(info.getGpus().size() > 0);
+ Assert.assertEquals(plugin.getMinorNumbersOfGpusUsableByYarn().size(),
+ info.getGpus().size());
+ }
+
+ @Test
+ public void getNumberOfUsableGpusFromConfig() throws YarnException {
+ Configuration conf = new Configuration(false);
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,2,4");
+ GpuDiscoverer plugin = new GpuDiscoverer();
+ plugin.initialize(conf);
+
+ List<Integer> minorNumbers = plugin.getMinorNumbersOfGpusUsableByYarn();
+ Assert.assertEquals(4, minorNumbers.size());
+
+ Assert.assertTrue(0 == minorNumbers.get(0));
+ Assert.assertTrue(1 == minorNumbers.get(1));
+ Assert.assertTrue(2 == minorNumbers.get(2));
+ Assert.assertTrue(4 == minorNumbers.get(3));
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/TestGpuDeviceInformationParser.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/TestGpuDeviceInformationParser.java
new file mode 100644
index 0000000..e22597d
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/TestGpuDeviceInformationParser.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.IOException;
+
+public class TestGpuDeviceInformationParser {
+
+  /**
+   * Parses the checked-in sample nvidia-smi XML dump and verifies that the
+   * driver version and per-GPU fields survive the round trip into the DAO
+   * objects.
+   */
+  @Test
+  public void testParse() throws IOException, YarnException {
+    File sampleFile = new File("src/test/resources/nvidia-smi-sample-xml-output");
+    String xml = FileUtils.readFileToString(sampleFile, "UTF-8");
+
+    GpuDeviceInformation info = new GpuDeviceInformationParser().parseXml(xml);
+
+    Assert.assertEquals("375.66", info.getDriverVersion());
+    Assert.assertEquals(2, info.getGpus().size());
+
+    // Spot-check the second GPU entry from the sample dump.
+    PerGpuDeviceInformation secondGpu = info.getGpus().get(1);
+    Assert.assertEquals("Tesla P100-PCIE-12GB", secondGpu.getProductName());
+    Assert.assertEquals(16384,
+        secondGpu.getGpuMemoryUsage().getTotalMemoryMiB());
+    Assert.assertEquals(10.3f,
+        secondGpu.getGpuUtilizations().getOverallGpuUtilization(), 1e-6);
+    Assert.assertEquals(34f,
+        secondGpu.getTemperature().getCurrentGpuTemp(), 1e-6);
+    Assert.assertEquals(85f,
+        secondGpu.getTemperature().getMaxGpuTemp(), 1e-6);
+    Assert.assertEquals(82f,
+        secondGpu.getTemperature().getSlowThresholdGpuTemp(), 1e-6);
+  }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-xml-output b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-xml-output
new file mode 100644
index 0000000..5ccb722
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-xml-output
@@ -0,0 +1,547 @@
+<?xml version="1.0" ?>
+<!DOCTYPE nvidia_smi_log SYSTEM "nvsmi_device_v8.dtd">
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<nvidia_smi_log>
+ <timestamp>Wed Sep 6 21:52:51 2017</timestamp>
+ <driver_version>375.66</driver_version>
+ <attached_gpus>2</attached_gpus>
+ <gpu id="0000:04:00.0">
+ <product_name>Tesla P100-PCIE-12GB</product_name>
+ <product_brand>Tesla</product_brand>
+ <display_mode>Disabled</display_mode>
+ <display_active>Disabled</display_active>
+ <persistence_mode>Disabled</persistence_mode>
+ <accounting_mode>Disabled</accounting_mode>
+ <accounting_mode_buffer_size>1920</accounting_mode_buffer_size>
+ <driver_model>
+ <current_dm>N/A</current_dm>
+ <pending_dm>N/A</pending_dm>
+ </driver_model>
+ <serial>0320717030197</serial>
+ <uuid>GPU-28604e81-21ec-cc48-6759-bf2648b22e16</uuid>
+ <minor_number>0</minor_number>
+ <vbios_version>86.00.3A.00.02</vbios_version>
+ <multigpu_board>No</multigpu_board>
+ <board_id>0x400</board_id>
+ <gpu_part_number>900-2H400-0110-030</gpu_part_number>
+ <inforom_version>
+ <img_version>H400.0202.00.01</img_version>
+ <oem_object>1.1</oem_object>
+ <ecc_object>4.1</ecc_object>
+ <pwr_object>N/A</pwr_object>
+ </inforom_version>
+ <gpu_operation_mode>
+ <current_gom>N/A</current_gom>
+ <pending_gom>N/A</pending_gom>
+ </gpu_operation_mode>
+ <gpu_virtualization_mode>
+ <virtualization_mode>None</virtualization_mode>
+ </gpu_virtualization_mode>
+ <pci>
+ <pci_bus>04</pci_bus>
+ <pci_device>00</pci_device>
+ <pci_domain>0000</pci_domain>
+ <pci_device_id>15F710DE</pci_device_id>
+ <pci_bus_id>0000:04:00.0</pci_bus_id>
+ <pci_sub_system_id>11DA10DE</pci_sub_system_id>
+ <pci_gpu_link_info>
+ <pcie_gen>
+ <max_link_gen>3</max_link_gen>
+ <current_link_gen>3</current_link_gen>
+ </pcie_gen>
+ <link_widths>
+ <max_link_width>16x</max_link_width>
+ <current_link_width>16x</current_link_width>
+ </link_widths>
+ </pci_gpu_link_info>
+ <pci_bridge_chip>
+ <bridge_chip_type>N/A</bridge_chip_type>
+ <bridge_chip_fw>N/A</bridge_chip_fw>
+ </pci_bridge_chip>
+ <replay_counter>0</replay_counter>
+ <tx_util>0 KB/s</tx_util>
+ <rx_util>0 KB/s</rx_util>
+ </pci>
+ <fan_speed>N/A</fan_speed>
+ <performance_state>P0</performance_state>
+ <clocks_throttle_reasons>
+ <clocks_throttle_reason_gpu_idle>Active</clocks_throttle_reason_gpu_idle>
+ <clocks_throttle_reason_applications_clocks_setting>Not Active</clocks_throttle_reason_applications_clocks_setting>
+ <clocks_throttle_reason_sw_power_cap>Not Active</clocks_throttle_reason_sw_power_cap>
+ <clocks_throttle_reason_hw_slowdown>Not Active</clocks_throttle_reason_hw_slowdown>
+ <clocks_throttle_reason_sync_boost>Not Active</clocks_throttle_reason_sync_boost>
+ <clocks_throttle_reason_unknown>Not Active</clocks_throttle_reason_unknown>
+ </clocks_throttle_reasons>
+ <fb_memory_usage>
+ <total>12193 MiB</total>
+ <used>0 MiB</used>
+ <free>12193 MiB</free>
+ </fb_memory_usage>
+ <bar1_memory_usage>
+ <total>16384 MiB</total>
+ <used>2 MiB</used>
+ <free>16382 MiB</free>
+ </bar1_memory_usage>
+ <compute_mode>Default</compute_mode>
+ <utilization>
+ <gpu_util>0 %</gpu_util>
+ <memory_util>0 %</memory_util>
+ <encoder_util>0 %</encoder_util>
+ <decoder_util>0 %</decoder_util>
+ </utilization>
+ <encoder_stats>
+ <session_count>0</session_count>
+ <average_fps>0</average_fps>
+ <average_latency>0 ms</average_latency>
+ </encoder_stats>
+ <ecc_mode>
+ <current_ecc>Enabled</current_ecc>
+ <pending_ecc>Enabled</pending_ecc>
+ </ecc_mode>
+ <ecc_errors>
+ <volatile>
+ <single_bit>
+ <device_memory>0</device_memory>
+ <register_file>0</register_file>
+ <l1_cache>N/A</l1_cache>
+ <l2_cache>0</l2_cache>
+ <texture_memory>0</texture_memory>
+ <texture_shm>0</texture_shm>
+ <total>0</total>
+ </single_bit>
+ <double_bit>
+ <device_memory>0</device_memory>
+ <register_file>0</register_file>
+ <l1_cache>N/A</l1_cache>
+ <l2_cache>0</l2_cache>
+ <texture_memory>0</texture_memory>
+ <texture_shm>0</texture_shm>
+ <total>0</total>
+ </double_bit>
+ </volatile>
+ <aggregate>
+ <single_bit>
+ <device_memory>0</device_memory>
+ <register_file>0</register_file>
+ <l1_cache>N/A</l1_cache>
+ <l2_cache>0</l2_cache>
+ <texture_memory>0</texture_memory>
+ <texture_shm>0</texture_shm>
+ <total>0</total>
+ </single_bit>
+ <double_bit>
+ <device_memory>0</device_memory>
+ <register_file>0</register_file>
+ <l1_cache>N/A</l1_cache>
+ <l2_cache>0</l2_cache>
+ <texture_memory>0</texture_memory>
+ <texture_shm>0</texture_shm>
+ <total>0</total>
+ </double_bit>
+ </aggregate>
+ </ecc_errors>
+ <retired_pages>
+ <multiple_single_bit_retirement>
+ <retired_count>0</retired_count>
+ <retired_page_addresses>
+ </retired_page_addresses>
+ </multiple_single_bit_retirement>
+ <double_bit_retirement>
+ <retired_count>0</retired_count>
+ <retired_page_addresses>
+ </retired_page_addresses>
+ </double_bit_retirement>
+ <pending_retirement>No</pending_retirement>
+ </retired_pages>
+ <temperature>
+ <gpu_temp>31 C</gpu_temp>
+ <gpu_temp_max_threshold>85 C</gpu_temp_max_threshold>
+ <gpu_temp_slow_threshold>82 C</gpu_temp_slow_threshold>
+ </temperature>
+ <power_readings>
+ <power_state>P0</power_state>
+ <power_management>Supported</power_management>
+ <power_draw>24.84 W</power_draw>
+ <power_limit>250.00 W</power_limit>
+ <default_power_limit>250.00 W</default_power_limit>
+ <enforced_power_limit>250.00 W</enforced_power_limit>
+ <min_power_limit>125.00 W</min_power_limit>
+ <max_power_limit>250.00 W</max_power_limit>
+ </power_readings>
+ <clocks>
+ <graphics_clock>405 MHz</graphics_clock>
+ <sm_clock>405 MHz</sm_clock>
+ <mem_clock>715 MHz</mem_clock>
+ <video_clock>835 MHz</video_clock>
+ </clocks>
+ <applications_clocks>
+ <graphics_clock>1189 MHz</graphics_clock>
+ <mem_clock>715 MHz</mem_clock>
+ </applications_clocks>
+ <default_applications_clocks>
+ <graphics_clock>1189 MHz</graphics_clock>
+ <mem_clock>715 MHz</mem_clock>
+ </default_applications_clocks>
+ <max_clocks>
+ <graphics_clock>1328 MHz</graphics_clock>
+ <sm_clock>1328 MHz</sm_clock>
+ <mem_clock>715 MHz</mem_clock>
+ <video_clock>1328 MHz</video_clock>
+ </max_clocks>
+ <clock_policy>
+ <auto_boost>N/A</auto_boost>
+ <auto_boost_default>N/A</auto_boost_default>
+ </clock_policy>
+ <supported_clocks>
+ <supported_mem_clock>
+ <value>715 MHz</value>
+ <supported_graphics_clock>1328 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1316 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1303 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1290 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1278 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1265 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1252 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1240 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1227 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1215 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1202 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1189 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1177 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1164 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1151 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1139 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1126 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1113 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1101 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1088 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1075 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1063 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1050 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1037 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1025 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1012 MHz</supported_graphics_clock>
+ <supported_graphics_clock>999 MHz</supported_graphics_clock>
+ <supported_graphics_clock>987 MHz</supported_graphics_clock>
+ <supported_graphics_clock>974 MHz</supported_graphics_clock>
+ <supported_graphics_clock>961 MHz</supported_graphics_clock>
+ <supported_graphics_clock>949 MHz</supported_graphics_clock>
+ <supported_graphics_clock>936 MHz</supported_graphics_clock>
+ <supported_graphics_clock>923 MHz</supported_graphics_clock>
+ <supported_graphics_clock>911 MHz</supported_graphics_clock>
+ <supported_graphics_clock>898 MHz</supported_graphics_clock>
+ <supported_graphics_clock>885 MHz</supported_graphics_clock>
+ <supported_graphics_clock>873 MHz</supported_graphics_clock>
+ <supported_graphics_clock>860 MHz</supported_graphics_clock>
+ <supported_graphics_clock>847 MHz</supported_graphics_clock>
+ <supported_graphics_clock>835 MHz</supported_graphics_clock>
+ <supported_graphics_clock>822 MHz</supported_graphics_clock>
+ <supported_graphics_clock>810 MHz</supported_graphics_clock>
+ <supported_graphics_clock>797 MHz</supported_graphics_clock>
+ <supported_graphics_clock>784 MHz</supported_graphics_clock>
+ <supported_graphics_clock>772 MHz</supported_graphics_clock>
+ <supported_graphics_clock>759 MHz</supported_graphics_clock>
+ <supported_graphics_clock>746 MHz</supported_graphics_clock>
+ <supported_graphics_clock>734 MHz</supported_graphics_clock>
+ <supported_graphics_clock>721 MHz</supported_graphics_clock>
+ <supported_graphics_clock>708 MHz</supported_graphics_clock>
+ <supported_graphics_clock>696 MHz</supported_graphics_clock>
+ <supported_graphics_clock>683 MHz</supported_graphics_clock>
+ <supported_graphics_clock>670 MHz</supported_graphics_clock>
+ <supported_graphics_clock>658 MHz</supported_graphics_clock>
+ <supported_graphics_clock>645 MHz</supported_graphics_clock>
+ <supported_graphics_clock>632 MHz</supported_graphics_clock>
+ <supported_graphics_clock>620 MHz</supported_graphics_clock>
+ <supported_graphics_clock>607 MHz</supported_graphics_clock>
+ <supported_graphics_clock>594 MHz</supported_graphics_clock>
+ <supported_graphics_clock>582 MHz</supported_graphics_clock>
+ <supported_graphics_clock>569 MHz</supported_graphics_clock>
+ <supported_graphics_clock>556 MHz</supported_graphics_clock>
+ <supported_graphics_clock>544 MHz</supported_graphics_clock>
+ </supported_mem_clock>
+ </supported_clocks>
+ <processes>
+ </processes>
+ <accounted_processes>
+ </accounted_processes>
+ </gpu>
+
+ <gpu id="0000:82:00.0">
+ <product_name>Tesla P100-PCIE-12GB</product_name>
+ <product_brand>Tesla</product_brand>
+ <display_mode>Disabled</display_mode>
+ <display_active>Disabled</display_active>
+ <persistence_mode>Disabled</persistence_mode>
+ <accounting_mode>Disabled</accounting_mode>
+ <accounting_mode_buffer_size>1920</accounting_mode_buffer_size>
+ <driver_model>
+ <current_dm>N/A</current_dm>
+ <pending_dm>N/A</pending_dm>
+ </driver_model>
+ <serial>0320717031755</serial>
+ <uuid>GPU-46915a82-3fd2-8e11-ae26-a80b607c04f3</uuid>
+ <minor_number>1</minor_number>
+ <vbios_version>86.00.3A.00.02</vbios_version>
+ <multigpu_board>No</multigpu_board>
+ <board_id>0x8200</board_id>
+ <gpu_part_number>900-2H400-0110-030</gpu_part_number>
+ <inforom_version>
+ <img_version>H400.0202.00.01</img_version>
+ <oem_object>1.1</oem_object>
+ <ecc_object>4.1</ecc_object>
+ <pwr_object>N/A</pwr_object>
+ </inforom_version>
+ <gpu_operation_mode>
+ <current_gom>N/A</current_gom>
+ <pending_gom>N/A</pending_gom>
+ </gpu_operation_mode>
+ <gpu_virtualization_mode>
+ <virtualization_mode>None</virtualization_mode>
+ </gpu_virtualization_mode>
+ <pci>
+ <pci_bus>82</pci_bus>
+ <pci_device>00</pci_device>
+ <pci_domain>0000</pci_domain>
+ <pci_device_id>15F710DE</pci_device_id>
+ <pci_bus_id>0000:82:00.0</pci_bus_id>
+ <pci_sub_system_id>11DA10DE</pci_sub_system_id>
+ <pci_gpu_link_info>
+ <pcie_gen>
+ <max_link_gen>3</max_link_gen>
+ <current_link_gen>3</current_link_gen>
+ </pcie_gen>
+ <link_widths>
+ <max_link_width>16x</max_link_width>
+ <current_link_width>16x</current_link_width>
+ </link_widths>
+ </pci_gpu_link_info>
+ <pci_bridge_chip>
+ <bridge_chip_type>N/A</bridge_chip_type>
+ <bridge_chip_fw>N/A</bridge_chip_fw>
+ </pci_bridge_chip>
+ <replay_counter>0</replay_counter>
+ <tx_util>0 KB/s</tx_util>
+ <rx_util>0 KB/s</rx_util>
+ </pci>
+ <fan_speed>N/A</fan_speed>
+ <performance_state>P0</performance_state>
+ <clocks_throttle_reasons>
+ <clocks_throttle_reason_gpu_idle>Active</clocks_throttle_reason_gpu_idle>
+ <clocks_throttle_reason_applications_clocks_setting>Not Active</clocks_throttle_reason_applications_clocks_setting>
+ <clocks_throttle_reason_sw_power_cap>Not Active</clocks_throttle_reason_sw_power_cap>
+ <clocks_throttle_reason_hw_slowdown>Not Active</clocks_throttle_reason_hw_slowdown>
+ <clocks_throttle_reason_sync_boost>Not Active</clocks_throttle_reason_sync_boost>
+ <clocks_throttle_reason_unknown>Not Active</clocks_throttle_reason_unknown>
+ </clocks_throttle_reasons>
+ <fb_memory_usage>
+ <total>12193 MiB</total>
+ <used>0 MiB</used>
+ <free>12193 MiB</free>
+ </fb_memory_usage>
+ <bar1_memory_usage>
+ <total>16384 MiB</total>
+ <used>2 MiB</used>
+ <free>16382 MiB</free>
+ </bar1_memory_usage>
+ <compute_mode>Default</compute_mode>
+ <utilization>
+ <gpu_util>10.3 %</gpu_util>
+ <memory_util>0 %</memory_util>
+ <encoder_util>0 %</encoder_util>
+ <decoder_util>0 %</decoder_util>
+ </utilization>
+ <encoder_stats>
+ <session_count>0</session_count>
+ <average_fps>0</average_fps>
+ <average_latency>0 ms</average_latency>
+ </encoder_stats>
+ <ecc_mode>
+ <current_ecc>Enabled</current_ecc>
+ <pending_ecc>Enabled</pending_ecc>
+ </ecc_mode>
+ <ecc_errors>
+ <volatile>
+ <single_bit>
+ <device_memory>0</device_memory>
+ <register_file>0</register_file>
+ <l1_cache>N/A</l1_cache>
+ <l2_cache>0</l2_cache>
+ <texture_memory>0</texture_memory>
+ <texture_shm>0</texture_shm>
+ <total>0</total>
+ </single_bit>
+ <double_bit>
+ <device_memory>0</device_memory>
+ <register_file>0</register_file>
+ <l1_cache>N/A</l1_cache>
+ <l2_cache>0</l2_cache>
+ <texture_memory>0</texture_memory>
+ <texture_shm>0</texture_shm>
+ <total>0</total>
+ </double_bit>
+ </volatile>
+ <aggregate>
+ <single_bit>
+ <device_memory>0</device_memory>
+ <register_file>0</register_file>
+ <l1_cache>N/A</l1_cache>
+ <l2_cache>0</l2_cache>
+ <texture_memory>0</texture_memory>
+ <texture_shm>0</texture_shm>
+ <total>0</total>
+ </single_bit>
+ <double_bit>
+ <device_memory>0</device_memory>
+ <register_file>0</register_file>
+ <l1_cache>N/A</l1_cache>
+ <l2_cache>0</l2_cache>
+ <texture_memory>0</texture_memory>
+ <texture_shm>0</texture_shm>
+ <total>0</total>
+ </double_bit>
+ </aggregate>
+ </ecc_errors>
+ <retired_pages>
+ <multiple_single_bit_retirement>
+ <retired_count>0</retired_count>
+ <retired_page_addresses>
+ </retired_page_addresses>
+ </multiple_single_bit_retirement>
+ <double_bit_retirement>
+ <retired_count>0</retired_count>
+ <retired_page_addresses>
+ </retired_page_addresses>
+ </double_bit_retirement>
+ <pending_retirement>No</pending_retirement>
+ </retired_pages>
+ <temperature>
+ <gpu_temp>34 C</gpu_temp>
+ <gpu_temp_max_threshold>85 C</gpu_temp_max_threshold>
+ <gpu_temp_slow_threshold>82 C</gpu_temp_slow_threshold>
+ </temperature>
+ <power_readings>
+ <power_state>P0</power_state>
+ <power_management>Supported</power_management>
+ <power_draw>25.54 W</power_draw>
+ <power_limit>250.00 W</power_limit>
+ <default_power_limit>250.00 W</default_power_limit>
+ <enforced_power_limit>250.00 W</enforced_power_limit>
+ <min_power_limit>125.00 W</min_power_limit>
+ <max_power_limit>250.00 W</max_power_limit>
+ </power_readings>
+ <clocks>
+ <graphics_clock>405 MHz</graphics_clock>
+ <sm_clock>405 MHz</sm_clock>
+ <mem_clock>715 MHz</mem_clock>
+ <video_clock>835 MHz</video_clock>
+ </clocks>
+ <applications_clocks>
+ <graphics_clock>1189 MHz</graphics_clock>
+ <mem_clock>715 MHz</mem_clock>
+ </applications_clocks>
+ <default_applications_clocks>
+ <graphics_clock>1189 MHz</graphics_clock>
+ <mem_clock>715 MHz</mem_clock>
+ </default_applications_clocks>
+ <max_clocks>
+ <graphics_clock>1328 MHz</graphics_clock>
+ <sm_clock>1328 MHz</sm_clock>
+ <mem_clock>715 MHz</mem_clock>
+ <video_clock>1328 MHz</video_clock>
+ </max_clocks>
+ <clock_policy>
+ <auto_boost>N/A</auto_boost>
+ <auto_boost_default>N/A</auto_boost_default>
+ </clock_policy>
+ <supported_clocks>
+ <supported_mem_clock>
+ <value>715 MHz</value>
+ <supported_graphics_clock>1328 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1316 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1303 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1290 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1278 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1265 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1252 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1240 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1227 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1215 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1202 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1189 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1177 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1164 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1151 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1139 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1126 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1113 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1101 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1088 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1075 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1063 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1050 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1037 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1025 MHz</supported_graphics_clock>
+ <supported_graphics_clock>1012 MHz</supported_graphics_clock>
+ <supported_graphics_clock>999 MHz</supported_graphics_clock>
+ <supported_graphics_clock>987 MHz</supported_graphics_clock>
+ <supported_graphics_clock>974 MHz</supported_graphics_clock>
+ <supported_graphics_clock>961 MHz</supported_graphics_clock>
+ <supported_graphics_clock>949 MHz</supported_graphics_clock>
+ <supported_graphics_clock>936 MHz</supported_graphics_clock>
+ <supported_graphics_clock>923 MHz</supported_graphics_clock>
+ <supported_graphics_clock>911 MHz</supported_graphics_clock>
+ <supported_graphics_clock>898 MHz</supported_graphics_clock>
+ <supported_graphics_clock>885 MHz</supported_graphics_clock>
+ <supported_graphics_clock>873 MHz</supported_graphics_clock>
+ <supported_graphics_clock>860 MHz</supported_graphics_clock>
+ <supported_graphics_clock>847 MHz</supported_graphics_clock>
+ <supported_graphics_clock>835 MHz</supported_graphics_clock>
+ <supported_graphics_clock>822 MHz</supported_graphics_clock>
+ <supported_graphics_clock>810 MHz</supported_graphics_clock>
+ <supported_graphics_clock>797 MHz</supported_graphics_clock>
+ <supported_graphics_clock>784 MHz</supported_graphics_clock>
+ <supported_graphics_clock>772 MHz</supported_graphics_clock>
+ <supported_graphics_clock>759 MHz</supported_graphics_clock>
+ <supported_graphics_clock>746 MHz</supported_graphics_clock>
+ <supported_graphics_clock>734 MHz</supported_graphics_clock>
+ <supported_graphics_clock>721 MHz</supported_graphics_clock>
+ <supported_graphics_clock>708 MHz</supported_graphics_clock>
+ <supported_graphics_clock>696 MHz</supported_graphics_clock>
+ <supported_graphics_clock>683 MHz</supported_graphics_clock>
+ <supported_graphics_clock>670 MHz</supported_graphics_clock>
+ <supported_graphics_clock>658 MHz</supported_graphics_clock>
+ <supported_graphics_clock>645 MHz</supported_graphics_clock>
+ <supported_graphics_clock>632 MHz</supported_graphics_clock>
+ <supported_graphics_clock>620 MHz</supported_graphics_clock>
+ <supported_graphics_clock>607 MHz</supported_graphics_clock>
+ <supported_graphics_clock>594 MHz</supported_graphics_clock>
+ <supported_graphics_clock>582 MHz</supported_graphics_clock>
+ <supported_graphics_clock>569 MHz</supported_graphics_clock>
+ <supported_graphics_clock>556 MHz</supported_graphics_clock>
+ <supported_graphics_clock>544 MHz</supported_graphics_clock>
+ </supported_mem_clock>
+ </supported_clocks>
+ <processes>
+ </processes>
+ <accounted_processes>
+ </accounted_processes>
+ </gpu>
+
+</nvidia_smi_log>
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org