You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by jh...@apache.org on 2018/09/06 17:14:03 UTC
[43/51] [abbrv] hadoop git commit: YARN-7137. [YARN-3926] Move newly
added APIs to unstable in YARN-3926 branch. Contributed by Wangda Tan.
YARN-7137. [YARN-3926] Move newly added APIs to unstable in YARN-3926 branch. Contributed by Wangda Tan.
(cherry picked from commit da0b6a354bf6f6bf37ca5a05a4a8eece09aa4893)
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/51222908
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/51222908
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/51222908
Branch: refs/heads/YARN-8200
Commit: 51222908a05d85138615aee8413843d59b727d73
Parents: 362ddc9
Author: Sunil G <su...@apache.org>
Authored: Tue Sep 12 20:31:47 2017 +0530
Committer: Jonathan Hung <jh...@linkedin.com>
Committed: Thu Sep 6 10:10:18 2018 -0700
----------------------------------------------------------------------
.../hadoop/yarn/api/records/Resource.java | 24 +-
.../yarn/api/records/ResourceInformation.java | 12 +-
.../yarn/api/records/ResourceRequest.java | 1 +
.../hadoop/yarn/conf/YarnConfiguration.java | 33 ++
.../yarn/util/resource/ResourceUtils.java | 70 +--
.../hadoop/yarn/util/resource/package-info.java | 6 +-
.../src/main/resources/yarn-default.xml | 48 +-
.../yarn/util/resource/TestResourceUtils.java | 17 +
.../server/nodemanager/ContainerExecutor.java | 3 +-
.../hadoop/yarn/server/nodemanager/Context.java | 3 +
.../nodemanager/DefaultContainerExecutor.java | 2 +-
.../nodemanager/LinuxContainerExecutor.java | 10 +-
.../yarn/server/nodemanager/NodeManager.java | 92 ++--
.../nodemanager/NodeStatusUpdaterImpl.java | 38 +-
.../linux/privileged/PrivilegedOperation.java | 1 +
.../linux/resources/ResourceHandlerChain.java | 4 +-
.../linux/resources/ResourceHandlerModule.java | 42 +-
.../resources/gpu/GpuResourceAllocator.java | 242 ++++++++
.../resources/gpu/GpuResourceHandlerImpl.java | 153 ++++++
.../NodeResourceUpdaterPlugin.java | 52 ++
.../resourceplugin/ResourcePlugin.java | 83 +++
.../resourceplugin/ResourcePluginManager.java | 106 ++++
.../resourceplugin/gpu/GpuDiscoverer.java | 254 +++++++++
.../gpu/GpuNodeResourceUpdateHandler.java | 66 +++
.../resourceplugin/gpu/GpuResourcePlugin.java | 61 +++
.../webapp/dao/gpu/GpuDeviceInformation.java | 72 +++
.../dao/gpu/GpuDeviceInformationParser.java | 87 +++
.../webapp/dao/gpu/PerGpuDeviceInformation.java | 165 ++++++
.../webapp/dao/gpu/PerGpuMemoryUsage.java | 58 ++
.../webapp/dao/gpu/PerGpuTemperature.java | 80 +++
.../webapp/dao/gpu/PerGpuUtilizations.java | 50 ++
.../server/nodemanager/NodeManagerTestBase.java | 164 ++++++
.../TestDefaultContainerExecutor.java | 4 +-
.../nodemanager/TestLinuxContainerExecutor.java | 2 +-
.../TestLinuxContainerExecutorWithMocks.java | 2 +-
.../server/nodemanager/TestNodeManager.java | 2 +-
.../nodemanager/TestNodeStatusUpdater.java | 100 +---
.../amrmproxy/BaseAMRMProxyTest.java | 46 +-
.../resources/TestResourceHandlerModule.java | 8 +-
.../resources/gpu/TestGpuResourceHandler.java | 382 +++++++++++++
.../TestContainersMonitorResourceChange.java | 2 +-
.../TestResourcePluginManager.java | 261 +++++++++
.../resourceplugin/gpu/TestGpuDiscoverer.java | 123 +++++
.../dao/gpu/TestGpuDeviceInformationParser.java | 50 ++
.../test/resources/nvidia-smi-sample-xml-output | 547 +++++++++++++++++++
.../resourcemanager/webapp/dao/AppInfo.java | 2 +-
46 files changed, 3385 insertions(+), 245 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/Resource.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/Resource.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/Resource.java
index 37b50f2..9a5bc79 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/Resource.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/Resource.java
@@ -206,8 +206,8 @@ public abstract class Resource implements Comparable<Resource> {
*
* @return Map of resource name to ResourceInformation
*/
- @Public
- @Evolving
+ @InterfaceAudience.Private
+ @InterfaceStability.Unstable
public ResourceInformation[] getResources() {
return resources;
}
@@ -220,7 +220,7 @@ public abstract class Resource implements Comparable<Resource> {
* @throws ResourceNotFoundException if the resource can't be found
*/
@Public
- @Evolving
+ @InterfaceStability.Unstable
public ResourceInformation getResourceInformation(String resource)
throws ResourceNotFoundException {
Integer index = ResourceUtils.getResourceTypeIndex().get(resource);
@@ -240,8 +240,8 @@ public abstract class Resource implements Comparable<Resource> {
* @throws ResourceNotFoundException
* if the resource can't be found
*/
- @Public
- @Evolving
+ @InterfaceAudience.Private
+ @InterfaceStability.Unstable
public ResourceInformation getResourceInformation(int index)
throws ResourceNotFoundException {
ResourceInformation ri = null;
@@ -262,7 +262,7 @@ public abstract class Resource implements Comparable<Resource> {
* @throws ResourceNotFoundException if the resource can't be found
*/
@Public
- @Evolving
+ @InterfaceStability.Unstable
public long getResourceValue(String resource)
throws ResourceNotFoundException {
return getResourceInformation(resource).getValue();
@@ -276,7 +276,7 @@ public abstract class Resource implements Comparable<Resource> {
* @throws ResourceNotFoundException if the resource is not found
*/
@Public
- @Evolving
+ @InterfaceStability.Unstable
public void setResourceInformation(String resource,
ResourceInformation resourceInformation)
throws ResourceNotFoundException {
@@ -302,8 +302,8 @@ public abstract class Resource implements Comparable<Resource> {
* @throws ResourceNotFoundException
* if the resource is not found
*/
- @Public
- @Evolving
+ @InterfaceAudience.Private
+ @InterfaceStability.Unstable
public void setResourceInformation(int index,
ResourceInformation resourceInformation)
throws ResourceNotFoundException {
@@ -323,7 +323,7 @@ public abstract class Resource implements Comparable<Resource> {
* @throws ResourceNotFoundException if the resource is not found
*/
@Public
- @Evolving
+ @InterfaceStability.Unstable
public void setResourceValue(String resource, long value)
throws ResourceNotFoundException {
if (resource.equals(ResourceInformation.MEMORY_URI)) {
@@ -350,8 +350,8 @@ public abstract class Resource implements Comparable<Resource> {
* @throws ResourceNotFoundException
* if the resource is not found
*/
- @Public
- @Evolving
+ @InterfaceAudience.Private
+ @InterfaceStability.Unstable
public void setResourceValue(int index, long value)
throws ResourceNotFoundException {
try {
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceInformation.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceInformation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceInformation.java
index 785b311..84b4748 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceInformation.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceInformation.java
@@ -18,11 +18,14 @@
package org.apache.hadoop.yarn.api.records;
-import org.apache.curator.shaded.com.google.common.reflect.ClassPath;
+import com.google.common.collect.ImmutableMap;
+
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.yarn.api.protocolrecords.ResourceTypes;
import org.apache.hadoop.yarn.util.UnitsConversionUtil;
+import java.util.Map;
+
/**
* Class to encapsulate information about a Resource - the name of the resource,
* the units(milli, micro, etc), the type(countable), and the value.
@@ -36,13 +39,20 @@ public class ResourceInformation implements Comparable<ResourceInformation> {
private long minimumAllocation;
private long maximumAllocation;
+ // Known resource types
public static final String MEMORY_URI = "memory-mb";
public static final String VCORES_URI = "vcores";
+ public static final String GPU_URI = "yarn.io/gpu";
public static final ResourceInformation MEMORY_MB =
ResourceInformation.newInstance(MEMORY_URI, "Mi");
public static final ResourceInformation VCORES =
ResourceInformation.newInstance(VCORES_URI);
+ public static final ResourceInformation GPUS =
+ ResourceInformation.newInstance(GPU_URI);
+
+ public static final Map<String, ResourceInformation> MANDATORY_RESOURCES =
+ ImmutableMap.of(MEMORY_URI, MEMORY_MB, VCORES_URI, VCORES, GPU_URI, GPUS);
/**
* Get the name for the resource.
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceRequest.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceRequest.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceRequest.java
index 94eda7c..e1a98ae 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceRequest.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceRequest.java
@@ -21,6 +21,7 @@ package org.apache.hadoop.yarn.api.records;
import java.io.Serializable;
import org.apache.hadoop.classification.InterfaceAudience.Public;
+import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.classification.InterfaceStability.Evolving;
import org.apache.hadoop.classification.InterfaceStability.Stable;
import org.apache.hadoop.classification.InterfaceStability.Unstable;
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index ec1869f..63e46d9 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -1411,6 +1411,39 @@ public class YarnConfiguration extends Configuration {
public static final String NM_NETWORK_RESOURCE_OUTBOUND_BANDWIDTH_YARN_MBIT =
NM_NETWORK_RESOURCE_PREFIX + "outbound-bandwidth-yarn-mbit";
+ /**
+ * Prefix for computation resources; examples of computation resources
+ * include GPU / FPGA / TPU, etc.
+ */
+ @Private
+ public static final String NM_RESOURCE_PLUGINS =
+ NM_PREFIX + "resource-plugins";
+
+ /**
+ * Prefix for gpu configurations. Work in progress: This configuration
+ * parameter may be changed/removed in the future.
+ */
+ @Private
+ public static final String NM_GPU_RESOURCE_PREFIX =
+ NM_RESOURCE_PLUGINS + ".gpu.";
+
+ @Private
+ public static final String NM_GPU_ALLOWED_DEVICES =
+ NM_GPU_RESOURCE_PREFIX + "allowed-gpu-devices";
+ @Private
+ public static final String AUTOMATICALLY_DISCOVER_GPU_DEVICES = "auto";
+
+ /**
+ * This setting controls where and how to invoke GPU binaries.
+ */
+ @Private
+ public static final String NM_GPU_PATH_TO_EXEC =
+ NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables";
+
+ @Private
+ public static final String DEFAULT_NM_GPU_PATH_TO_EXEC = "";
+
+
/** NM Webapp address.**/
public static final String NM_WEBAPP_ADDRESS = NM_PREFIX + "webapp.address";
public static final int DEFAULT_NM_WEBAPP_PORT = 8042;
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/ResourceUtils.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/ResourceUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/ResourceUtils.java
index 110453a..f3edc74 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/ResourceUtils.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/ResourceUtils.java
@@ -46,11 +46,11 @@ import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
+import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
+
/**
* Helper class to read the resource-types to be supported by the system.
*/
-@InterfaceAudience.Public
-@InterfaceStability.Unstable
public class ResourceUtils {
public static final String UNITS = ".units";
@@ -65,7 +65,6 @@ public class ResourceUtils {
private static final Map<String, Integer> RESOURCE_NAME_TO_INDEX =
new ConcurrentHashMap<String, Integer>();
private static volatile Map<String, ResourceInformation> resourceTypes;
- private static volatile String[] resourceNamesArray;
private static volatile ResourceInformation[] resourceTypesArray;
private static volatile boolean initializedNodeResources = false;
private static volatile Map<String, ResourceInformation> readOnlyNodeResources;
@@ -85,33 +84,32 @@ public class ResourceUtils {
*/
String key = "memory";
if (resourceInformationMap.containsKey(key)) {
- LOG.warn("Attempt to define resource '" + key +
- "', but it is not allowed.");
- throw new YarnRuntimeException("Attempt to re-define mandatory resource '"
- + key + "'.");
- }
-
- if (resourceInformationMap.containsKey(MEMORY)) {
- ResourceInformation memInfo = resourceInformationMap.get(MEMORY);
- String memUnits = ResourceInformation.MEMORY_MB.getUnits();
- ResourceTypes memType = ResourceInformation.MEMORY_MB.getResourceType();
- if (!memInfo.getUnits().equals(memUnits) || !memInfo.getResourceType()
- .equals(memType)) {
- throw new YarnRuntimeException(
- "Attempt to re-define mandatory resource 'memory-mb'. It can only"
- + " be of type 'COUNTABLE' and have units 'Mi'.");
- }
+ LOG.warn(
+ "Attempt to define resource '" + key + "', but it is not allowed.");
+ throw new YarnRuntimeException(
+ "Attempt to re-define mandatory resource '" + key + "'.");
}
- if (resourceInformationMap.containsKey(VCORES)) {
- ResourceInformation vcoreInfo = resourceInformationMap.get(VCORES);
- String vcoreUnits = ResourceInformation.VCORES.getUnits();
- ResourceTypes vcoreType = ResourceInformation.VCORES.getResourceType();
- if (!vcoreInfo.getUnits().equals(vcoreUnits) || !vcoreInfo
- .getResourceType().equals(vcoreType)) {
- throw new YarnRuntimeException(
- "Attempt to re-define mandatory resource 'vcores'. It can only be"
- + " of type 'COUNTABLE' and have units ''(no units).");
+ for (Map.Entry<String, ResourceInformation> mandatoryResourceEntry :
+ ResourceInformation.MANDATORY_RESOURCES.entrySet()) {
+ key = mandatoryResourceEntry.getKey();
+ ResourceInformation mandatoryRI = mandatoryResourceEntry.getValue();
+
+ ResourceInformation newDefinedRI = resourceInformationMap.get(key);
+ if (newDefinedRI != null) {
+ String expectedUnit = mandatoryRI.getUnits();
+ ResourceTypes expectedType = mandatoryRI.getResourceType();
+ String actualUnit = newDefinedRI.getUnits();
+ ResourceTypes actualType = newDefinedRI.getResourceType();
+
+ if (!expectedUnit.equals(actualUnit) || !expectedType.equals(
+ actualType)) {
+ throw new YarnRuntimeException("Defined mandatory resource type="
+ + key + " inside resource-types.xml, however its type or "
+ + "unit is conflict to mandatory resource types, expected type="
+ + expectedType + ", unit=" + expectedUnit + "; actual type="
+ + actualType + " actual unit=" + actualUnit);
+ }
}
}
}
@@ -270,7 +268,6 @@ public class ResourceUtils {
private static void updateKnownResources() {
// Update resource names.
- resourceNamesArray = new String[resourceTypes.size()];
resourceTypesArray = new ResourceInformation[resourceTypes.size()];
int index = 2;
@@ -278,14 +275,11 @@ public class ResourceUtils {
if (resInfo.getName().equals(MEMORY)) {
resourceTypesArray[0] = ResourceInformation
.newInstance(resourceTypes.get(MEMORY));
- resourceNamesArray[0] = MEMORY;
} else if (resInfo.getName().equals(VCORES)) {
resourceTypesArray[1] = ResourceInformation
.newInstance(resourceTypes.get(VCORES));
- resourceNamesArray[1] = VCORES;
} else {
resourceTypesArray[index] = ResourceInformation.newInstance(resInfo);
- resourceNamesArray[index] = resInfo.getName();
index++;
}
}
@@ -319,18 +313,6 @@ public class ResourceUtils {
YarnConfiguration.RESOURCE_TYPES_CONFIGURATION_FILE);
}
- /**
- * Get resource names array, this is mostly for performance perspective. Never
- * modify returned array.
- *
- * @return resourceNamesArray
- */
- public static String[] getResourceNamesArray() {
- initializeResourceTypesIfNeeded(null,
- YarnConfiguration.RESOURCE_TYPES_CONFIGURATION_FILE);
- return resourceNamesArray;
- }
-
public static ResourceInformation[] getResourceTypesArray() {
initializeResourceTypesIfNeeded(null,
YarnConfiguration.RESOURCE_TYPES_CONFIGURATION_FILE);
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/package-info.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/package-info.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/package-info.java
index 1e925d7..d7c799d 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/package-info.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/package-info.java
@@ -19,8 +19,4 @@
* Package org.apache.hadoop.yarn.util.resource contains classes
* which is used as utility class for resource profile computations.
*/
-@InterfaceAudience.Public
-@InterfaceStability.Unstable
-package org.apache.hadoop.yarn.util.resource;
-import org.apache.hadoop.classification.InterfaceAudience;
-import org.apache.hadoop.classification.InterfaceStability;
\ No newline at end of file
+package org.apache.hadoop.yarn.util.resource;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 768deb2..20442d9 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -3381,6 +3381,16 @@
<value>0.0.0.0:8091</value>
</property>
+ <!-- resource types configuration -->
+ <property>
+ <name>yarn.resource-types</name>
+ <value></value>
+ <description>
+ The resource types to be used for scheduling. Use resource-types.xml
+ to specify details about the individual resource types.
+ </description>
+ </property>
+
<property>
<name>yarn.resourcemanager.display.per-user-apps</name>
<value>false</value>
@@ -3481,11 +3491,43 @@
<!-- resource types configuration -->
<property>
- <name>yarn.resource-types</name>
+ <description>
+ When yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices=auto is specified,
+ YARN NodeManager needs to run GPU discovery binary (now only support
+ nvidia-smi) to get GPU-related information.
+ When value is empty (default), YARN NodeManager will try to locate
+ discovery executable itself.
+ An example of the config value is: /usr/local/bin/nvidia-smi
+ </description>
+ <name>yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables</name>
<value></value>
+ </property>
+
+ <property>
<description>
- The resource types to be used for scheduling. Use resource-types.xml
- to specify details about the individual resource types.
+ Enable additional discovery/isolation of resources on the NodeManager,
+ split by comma. By default, this is empty. Acceptable values: { "yarn.io/gpu" }.
+ </description>
+ <name>yarn.nodemanager.resource-plugins</name>
+ <value></value>
+ </property>
+
+ <property>
+ <description>
+ Specify GPU devices which can be managed by YARN NodeManager, split by comma.
+ Number of GPU devices will be reported to RM to make scheduling decisions.
+ Set to auto (default) to let YARN automatically discover GPU resources
+ from the system.
+ Manually specify GPU devices if automatic detection fails or if the
+ admin only wants a subset of GPU devices to be managed by YARN. A GPU
+ device is identified by its minor device number. A common approach to
+ get the minor device numbers of GPUs is to run "nvidia-smi -q" and
+ search the "Minor Number" output. An
+ example of manual specification is "0,1,2,4" to allow YARN NodeManager
+ to manage GPU devices with minor number 0/1/2/4.
</description>
+ <name>yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices</name>
+ <value>auto</value>
</property>
+
+
</configuration>
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/resource/TestResourceUtils.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/resource/TestResourceUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/resource/TestResourceUtils.java
index d6bab92..80555ca 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/resource/TestResourceUtils.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/resource/TestResourceUtils.java
@@ -52,6 +52,23 @@ public class TestResourceUtils {
}
}
+ public static void addNewTypesToResources(String... resourceTypes) {
+ // Initialize resource map
+ Map<String, ResourceInformation> riMap = new HashMap<>();
+
+ // Initialize mandatory resources
+ riMap.put(ResourceInformation.MEMORY_URI, ResourceInformation.MEMORY_MB);
+ riMap.put(ResourceInformation.VCORES_URI, ResourceInformation.VCORES);
+
+ for (String newResource : resourceTypes) {
+ riMap.put(newResource, ResourceInformation
+ .newInstance(newResource, "", 0, ResourceTypes.COUNTABLE, 0,
+ Integer.MAX_VALUE));
+ }
+
+ ResourceUtils.initializeResourcesFromResourceInformationMap(riMap);
+ }
+
@Before
public void setup() {
ResourceUtils.resetResourceTypes();
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java
index 9454da4..e65b677 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java
@@ -112,9 +112,10 @@ public abstract class ContainerExecutor implements Configurable {
* Run the executor initialization steps.
* Verify that the necessary configs and permissions are in place.
*
+ * @param nmContext Context of NM
* @throws IOException if initialization fails
*/
- public abstract void init() throws IOException;
+ public abstract void init(Context nmContext) throws IOException;
/**
* This function localizes the JAR file on-demand.
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java
index 33cefea..7e16034 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java
@@ -34,6 +34,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManag
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
@@ -122,4 +123,6 @@ public interface Context {
void setNMTimelinePublisher(NMTimelinePublisher nmMetricsPublisher);
NMTimelinePublisher getNMTimelinePublisher();
+
+ ResourcePluginManager getResourcePluginManager();
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java
index b54b7f5..e659c3e 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java
@@ -134,7 +134,7 @@ public class DefaultContainerExecutor extends ContainerExecutor {
}
@Override
- public void init() throws IOException {
+ public void init(Context nmContext) throws IOException {
// nothing to do or verify here
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
index 765c49a..04f27c2 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
@@ -20,6 +20,7 @@ package org.apache.hadoop.yarn.server.nodemanager;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -281,7 +282,7 @@ public class LinuxContainerExecutor extends ContainerExecutor {
}
@Override
- public void init() throws IOException {
+ public void init(Context nmContext) throws IOException {
Configuration conf = super.getConf();
// Send command to executor which will just start up,
@@ -305,7 +306,7 @@ public class LinuxContainerExecutor extends ContainerExecutor {
try {
resourceHandlerChain = ResourceHandlerModule
- .getConfiguredResourceHandlerChain(conf);
+ .getConfiguredResourceHandlerChain(conf, nmContext);
if (LOG.isDebugEnabled()) {
LOG.debug("Resource handler chain enabled = " + (resourceHandlerChain
!= null));
@@ -845,4 +846,9 @@ public class LinuxContainerExecutor extends ContainerExecutor {
e);
}
}
+
+ @VisibleForTesting
+ public ResourceHandler getResourceHandler() {
+ return resourceHandlerChain;
+ }
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
index fcb5474..c74b54e 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
@@ -18,23 +18,7 @@
package org.apache.hadoop.yarn.server.nodemanager;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ConcurrentLinkedQueue;
-import java.util.concurrent.ConcurrentMap;
-import java.util.concurrent.ConcurrentSkipListMap;
-import java.util.concurrent.atomic.AtomicBoolean;
-
-import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
-import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl;
-import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
-import org.apache.hadoop.yarn.state.MultiStateTransitionListener;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
+import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
@@ -65,12 +49,16 @@ import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport;
import org.apache.hadoop.yarn.server.api.records.AppCollectorData;
import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus;
-import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager;
import org.apache.hadoop.yarn.server.nodemanager.collectormanager.NMCollectorService;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.apache.hadoop.yarn.server.nodemanager.nodelabels.ConfigurationNodeLabelsProvider;
import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider;
@@ -78,14 +66,25 @@ import org.apache.hadoop.yarn.server.nodemanager.nodelabels.ScriptBasedNodeLabel
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
-import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM;
import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher;
import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer;
+import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
+import org.apache.hadoop.yarn.state.MultiStateTransitionListener;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
-import com.google.common.annotations.VisibleForTesting;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+import java.util.concurrent.atomic.AtomicBoolean;
public class NodeManager extends CompositeService
implements EventHandler<NodeManagerEvent> {
@@ -332,6 +331,18 @@ public class NodeManager extends CompositeService
nmCheckintervalTime, scriptTimeout, scriptArgs);
}
+ /** Factory for the NM's ResourcePluginManager; overridable by tests. */
+ @VisibleForTesting
+ protected ResourcePluginManager createResourcePluginManager() {
+ return new ResourcePluginManager();
+ }
+
+ /**
+ * Instantiates the container executor configured via
+ * yarn.nodemanager.container-executor.class (default:
+ * DefaultContainerExecutor); overridable by tests.
+ */
+ @VisibleForTesting
+ protected ContainerExecutor createContainerExecutor(Configuration conf) {
+ return ReflectionUtils.newInstance(
+ conf.getClass(YarnConfiguration.NM_CONTAINER_EXECUTOR,
+ DefaultContainerExecutor.class, ContainerExecutor.class), conf);
+ }
+
@Override
protected void serviceInit(Configuration conf) throws Exception {
@@ -360,11 +371,20 @@ public class NodeManager extends CompositeService
this.aclsManager = new ApplicationACLsManager(conf);
- ContainerExecutor exec = ReflectionUtils.newInstance(
- conf.getClass(YarnConfiguration.NM_CONTAINER_EXECUTOR,
- DefaultContainerExecutor.class, ContainerExecutor.class), conf);
+ boolean isDistSchedulingEnabled =
+ conf.getBoolean(YarnConfiguration.DIST_SCHEDULING_ENABLED,
+ YarnConfiguration.DEFAULT_DIST_SCHEDULING_ENABLED);
+
+ this.context = createNMContext(containerTokenSecretManager,
+ nmTokenSecretManager, nmStore, isDistSchedulingEnabled, conf);
+
+ ResourcePluginManager pluginManager = createResourcePluginManager();
+ pluginManager.initialize(context);
+ ((NMContext)context).setResourcePluginManager(pluginManager);
+
+ ContainerExecutor exec = createContainerExecutor(conf);
try {
- exec.init();
+ exec.init(context);
} catch (IOException e) {
throw new YarnRuntimeException("Failed to initialize container executor", e);
}
@@ -380,13 +400,6 @@ public class NodeManager extends CompositeService
getNodeHealthScriptRunner(conf), dirsHandler);
addService(nodeHealthChecker);
- boolean isDistSchedulingEnabled =
- conf.getBoolean(YarnConfiguration.DIST_SCHEDULING_ENABLED,
- YarnConfiguration.DEFAULT_DIST_SCHEDULING_ENABLED);
-
- this.context = createNMContext(containerTokenSecretManager,
- nmTokenSecretManager, nmStore, isDistSchedulingEnabled, conf);
-
((NMContext)context).setContainerExecutor(exec);
@@ -460,6 +473,12 @@ public class NodeManager extends CompositeService
try {
super.serviceStop();
DefaultMetricsSystem.shutdown();
+
+ // Cleanup ResourcePluginManager
+ ResourcePluginManager rpm = context.getResourcePluginManager();
+ if (rpm != null) {
+ rpm.cleanup();
+ }
} finally {
// YARN-3641: NM's services stop get failed shouldn't block the
// release of NMLevelDBStore.
@@ -607,6 +626,8 @@ public class NodeManager extends CompositeService
private NMTimelinePublisher nmTimelinePublisher;
+ private ResourcePluginManager resourcePluginManager;
+
public NMContext(NMContainerTokenSecretManager containerTokenSecretManager,
NMTokenSecretManagerInNM nmTokenSecretManager,
LocalDirsHandlerService dirsHandler, ApplicationACLsManager aclsManager,
@@ -807,6 +828,15 @@ public class NodeManager extends CompositeService
public NMTimelinePublisher getNMTimelinePublisher() {
return nmTimelinePublisher;
}
+
+ public ResourcePluginManager getResourcePluginManager() {
+ return resourcePluginManager;
+ }
+
+ public void setResourcePluginManager(
+ ResourcePluginManager resourcePluginManager) {
+ this.resourcePluginManager = resourcePluginManager;
+ }
}
/**
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java
index 888ee85..d776bdf 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java
@@ -33,6 +33,9 @@ import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
+
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -178,14 +181,15 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
long memoryMb = totalResource.getMemorySize();
float vMemToPMem =
conf.getFloat(
- YarnConfiguration.NM_VMEM_PMEM_RATIO,
- YarnConfiguration.DEFAULT_NM_VMEM_PMEM_RATIO);
+ YarnConfiguration.NM_VMEM_PMEM_RATIO,
+ YarnConfiguration.DEFAULT_NM_VMEM_PMEM_RATIO);
long virtualMemoryMb = (long)Math.ceil(memoryMb * vMemToPMem);
-
int virtualCores = totalResource.getVirtualCores();
- LOG.info("Nodemanager resources: memory set to " + memoryMb + "MB.");
- LOG.info("Nodemanager resources: vcores set to " + virtualCores + ".");
- LOG.info("Nodemanager resources: " + totalResource);
+
+ // Update configured resources via plugins.
+ updateConfiguredResourcesViaPlugins(totalResource);
+
+ LOG.info("Nodemanager resources is set to: " + totalResource);
metrics.addResource(totalResource);
@@ -342,12 +346,27 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
return ServerRMProxy.createRMProxy(conf, ResourceTracker.class);
}
+ /**
+ * Lets every registered resource plugin that provides a node resource
+ * handler adjust the node's configured resource (e.g. discovered GPU
+ * count) before it is reported to the RM. No-op when no plugin manager
+ * or no plugins are present.
+ * @param configuredResource the node resource to update in place
+ * @throws YarnException if a plugin fails to update the resource
+ */
+ private void updateConfiguredResourcesViaPlugins(
+ Resource configuredResource) throws YarnException {
+ ResourcePluginManager pluginManager = context.getResourcePluginManager();
+ if (pluginManager != null && pluginManager.getNameToPlugins() != null) {
+ // Update configured resource
+ for (ResourcePlugin resourcePlugin : pluginManager.getNameToPlugins()
+ .values()) {
+ // Only plugins that expose a node resource handler participate.
+ if (resourcePlugin.getNodeResourceHandlerInstance() != null) {
+ resourcePlugin.getNodeResourceHandlerInstance()
+ .updateConfiguredResource(configuredResource);
+ }
+ }
+ }
+ }
+
@VisibleForTesting
protected void registerWithRM()
throws YarnException, IOException {
RegisterNodeManagerResponse regNMResponse;
Set<NodeLabel> nodeLabels = nodeLabelsHandler.getNodeLabelsForRegistration();
-
+
// Synchronize NM-RM registration with
// ContainerManagerImpl#increaseContainersResource and
// ContainerManagerImpl#startContainers to avoid race condition
@@ -358,6 +377,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
RegisterNodeManagerRequest.newInstance(nodeId, httpPort, totalResource,
nodeManagerVersionId, containerReports, getRunningApplications(),
nodeLabels, physicalResource);
+
if (containerReports != null) {
LOG.info("Registering with RM using containers :" + containerReports);
}
@@ -406,7 +426,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
if (masterKey != null) {
this.context.getContainerTokenSecretManager().setMasterKey(masterKey);
}
-
+
masterKey = regNMResponse.getNMTokenMasterKey();
if (masterKey != null) {
this.context.getNMTokenSecretManager().setMasterKey(masterKey);
@@ -733,7 +753,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
}
}
}
-
+
@Override
public long getRMIdentifier() {
return this.rmIdentifier;
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java
index 8402a16..db0b225 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java
@@ -51,6 +51,7 @@ public class PrivilegedOperation {
TC_READ_STATS("--tc-read-stats"),
ADD_PID_TO_CGROUP(""), //no CLI switch supported yet.
RUN_DOCKER_CMD("--run-docker"),
+ GPU("--module-gpu"),
LIST_AS_USER(""); //no CLI switch supported yet.
private final String option;
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerChain.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerChain.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerChain.java
index 955d216..72bf30c 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerChain.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerChain.java
@@ -20,6 +20,7 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources;
+import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
@@ -135,7 +136,8 @@ public class ResourceHandlerChain implements ResourceHandler {
return allOperations;
}
- List<ResourceHandler> getResourceHandlerList() {
+ @VisibleForTesting
+ public List<ResourceHandler> getResourceHandlerList() {
return Collections.unmodifiableList(resourceHandlers);
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java
index 3c61cd4..ce850ab 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java
@@ -21,25 +21,28 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources;
import com.google.common.annotations.VisibleForTesting;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.apache.hadoop.yarn.server.nodemanager.util.CgroupsLCEResourcesHandler;
import org.apache.hadoop.yarn.server.nodemanager.util.DefaultLCEResourcesHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
-import java.util.Set;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.HashMap;
-import java.util.Arrays;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
+import java.util.Map;
+import java.util.Set;
/**
* Provides mechanisms to get various resource handlers - cpu, memory, network,
@@ -206,22 +209,41 @@ public class ResourceHandlerModule {
}
private static void initializeConfiguredResourceHandlerChain(
- Configuration conf) throws ResourceHandlerException {
+ Configuration conf, Context nmContext)
+ throws ResourceHandlerException {
ArrayList<ResourceHandler> handlerList = new ArrayList<>();
addHandlerIfNotNull(handlerList, getOutboundBandwidthResourceHandler(conf));
addHandlerIfNotNull(handlerList, getDiskResourceHandler(conf));
addHandlerIfNotNull(handlerList, getMemoryResourceHandler(conf));
addHandlerIfNotNull(handlerList, getCGroupsCpuResourceHandler(conf));
+ addHandlersFromConfiguredResourcePlugins(handlerList, conf, nmContext);
resourceHandlerChain = new ResourceHandlerChain(handlerList);
}
+ /**
+ * Appends to the handler chain any ResourceHandler produced by the
+ * resource plugins registered in the NM context (e.g. the GPU plugin).
+ * Plugins returning null handlers are skipped. No-op when the context
+ * has no plugin manager or no plugins.
+ * @param handlerList chain under construction; handlers appended in place
+ * @param conf NM configuration used to build cgroups/privileged helpers
+ * @param nmContext NM context holding the ResourcePluginManager
+ * @throws ResourceHandlerException if a plugin fails to create its handler
+ */
+ private static void addHandlersFromConfiguredResourcePlugins(
+ List<ResourceHandler> handlerList, Configuration conf,
+ Context nmContext) throws ResourceHandlerException {
+ ResourcePluginManager pluginManager = nmContext.getResourcePluginManager();
+ if (pluginManager != null) {
+ Map<String, ResourcePlugin> pluginMap = pluginManager.getNameToPlugins();
+ if (pluginMap != null) {
+ for (ResourcePlugin plugin : pluginMap.values()) {
+ addHandlerIfNotNull(handlerList, plugin
+ .createResourceHandler(nmContext,
+ getInitializedCGroupsHandler(conf),
+ PrivilegedOperationExecutor.getInstance(conf)));
+ }
+ }
+ }
+ }
+
public static ResourceHandlerChain getConfiguredResourceHandlerChain(
- Configuration conf) throws ResourceHandlerException {
+ Configuration conf, Context nmContext) throws ResourceHandlerException {
if (resourceHandlerChain == null) {
synchronized (ResourceHandlerModule.class) {
if (resourceHandlerChain == null) {
- initializeConfiguredResourceHandlerChain(conf);
+ initializeConfiguredResourceHandlerChain(conf, nmContext);
}
}
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java
new file mode 100644
index 0000000..d6bae09
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java
@@ -0,0 +1,242 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Sets;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceInformation;
+import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
+
+/**
+ * Allocates GPU devices (identified by minor number) to containers and
+ * tracks which container currently owns each device. All public methods
+ * are synchronized, so one instance may be shared across NM threads.
+ */
+public class GpuResourceAllocator {
+ final static Log LOG = LogFactory.getLog(GpuResourceAllocator.class);
+
+ // Minor numbers of all GPU devices this NM is allowed to hand out.
+ private Set<Integer> allowedGpuDevices = new TreeSet<>();
+ // Device minor number -> container the device is currently assigned to.
+ private Map<Integer, ContainerId> usedDevices = new TreeMap<>();
+ private Context nmContext;
+
+ public GpuResourceAllocator(Context ctx) {
+ this.nmContext = ctx;
+ }
+
+ /**
+ * Holds the allowed and denied device minor numbers for one allocation.
+ * The denied set is consumed by the cgroups devices module to blacklist
+ * devices the container must not access. Both sets are immutable copies.
+ */
+ static class GpuAllocation {
+ private Set<Integer> allowed = Collections.emptySet();
+ private Set<Integer> denied = Collections.emptySet();
+
+ GpuAllocation(Set<Integer> allowed, Set<Integer> denied) {
+ if (allowed != null) {
+ this.allowed = ImmutableSet.copyOf(allowed);
+ }
+ if (denied != null) {
+ this.denied = ImmutableSet.copyOf(denied);
+ }
+ }
+
+ public Set<Integer> getAllowedGPUs() {
+ return allowed;
+ }
+
+ public Set<Integer> getDeniedGPUs() {
+ return denied;
+ }
+ }
+
+ /**
+ * Add a GPU device to the allowed list.
+ * @param minorNumber minor number of the GPU device.
+ */
+ public synchronized void addGpu(int minorNumber) {
+ allowedGpuDevices.add(minorNumber);
+ }
+
+ // Builds the "not enough GPUs" message used by assignGpus.
+ private String getResourceHandlerExceptionMessage(int numRequestedGpuDevices,
+ ContainerId containerId) {
+ return "Failed to find enough GPUs, requestor=" + containerId
+ + ", #RequestedGPUs=" + numRequestedGpuDevices + ", #availableGpus="
+ + getAvailableGpus();
+ }
+
+ /** Number of allowed devices not currently assigned to any container. */
+ @VisibleForTesting
+ public synchronized int getAvailableGpus() {
+ return allowedGpuDevices.size() - usedDevices.size();
+ }
+
+ /**
+ * Re-populates the device-to-container mapping for a recovered container
+ * from the GPU assignments persisted in its ResourceMappings.
+ * @param containerId container whose GPU assignments are being recovered
+ * @throws ResourceHandlerException if the container cannot be found, a
+ * stored device id is not a valid integer string, is not in the
+ * allowed device list, or is already assigned to another container
+ */
+ public synchronized void recoverAssignedGpus(ContainerId containerId)
+ throws ResourceHandlerException {
+ Container c = nmContext.getContainers().get(containerId);
+ if (null == c) {
+ throw new ResourceHandlerException(
+ "This shouldn't happen, cannot find container with id="
+ + containerId);
+ }
+
+ for (Serializable deviceId : c.getResourceMappings().getAssignedResources(
+ GPU_URI)){
+ if (!(deviceId instanceof String)) {
+ throw new ResourceHandlerException(
+ "Trying to recover device id, however it"
+ + " is not String, this shouldn't happen");
+ }
+
+
+ int devId;
+ try {
+ devId = Integer.parseInt((String)deviceId);
+ } catch (NumberFormatException e) {
+ // NOTE(review): concatenation is missing a space before "it",
+ // producing "becauseit" in the message text.
+ throw new ResourceHandlerException("Failed to recover device id because"
+ + "it is not a valid integer, devId:" + deviceId);
+ }
+
+ // Make sure it is in allowed GPU device.
+ if (!allowedGpuDevices.contains(devId)) {
+ throw new ResourceHandlerException("Try to recover device id = " + devId
+ + " however it is not in allowed device list:" + StringUtils
+ .join(",", allowedGpuDevices));
+ }
+
+ // Make sure it is not occupied by anybody else
+ if (usedDevices.containsKey(devId)) {
+ throw new ResourceHandlerException("Try to recover device id = " + devId
+ + " however it is already assigned to container=" + usedDevices
+ .get(devId) + ", please double check what happened.");
+ }
+
+ usedDevices.put(devId, containerId);
+ }
+ }
+
+ // Number of GPUs requested by the resource; 0 when the GPU resource
+ // type is not present in the request.
+ private int getRequestedGpus(Resource requestedResource) {
+ try {
+ return Long.valueOf(requestedResource.getResourceValue(
+ GPU_URI)).intValue();
+ } catch (ResourceNotFoundException e) {
+ return 0;
+ }
+ }
+
+ /**
+ * Assigns GPUs to the given container if it requested any.
+ * @param container container to allocate GPUs for
+ * @return a GpuAllocation holding the minor numbers of the GPUs assigned
+ * to the container and of all denied (remaining) GPUs
+ * @throws ResourceHandlerException when there are not enough available
+ * GPUs, or persisting the assignment to the NM state store fails
+ * (in which case the in-memory assignment is rolled back)
+ */
+ public synchronized GpuAllocation assignGpus(Container container)
+ throws ResourceHandlerException {
+ Resource requestedResource = container.getResource();
+ ContainerId containerId = container.getContainerId();
+ int numRequestedGpuDevices = getRequestedGpus(requestedResource);
+ // Assign Gpus to container if requested some.
+ if (numRequestedGpuDevices > 0) {
+ if (numRequestedGpuDevices > getAvailableGpus()) {
+ throw new ResourceHandlerException(
+ getResourceHandlerExceptionMessage(numRequestedGpuDevices,
+ containerId));
+ }
+
+ Set<Integer> assignedGpus = new HashSet<>();
+
+ // Greedily take the first free devices in minor-number order
+ // (allowedGpuDevices is a sorted TreeSet).
+ for (int deviceNum : allowedGpuDevices) {
+ if (!usedDevices.containsKey(deviceNum)) {
+ usedDevices.put(deviceNum, containerId);
+ assignedGpus.add(deviceNum);
+ if (assignedGpus.size() == numRequestedGpuDevices) {
+ break;
+ }
+ }
+ }
+
+ // Record in state store if we allocated anything
+ if (!assignedGpus.isEmpty()) {
+ List<Serializable> allocatedDevices = new ArrayList<>();
+ for (int gpu : assignedGpus) {
+ allocatedDevices.add(String.valueOf(gpu));
+ }
+ try {
+ // Update Container#getResourceMapping.
+ ResourceMappings.AssignedResources assignedResources =
+ new ResourceMappings.AssignedResources();
+ assignedResources.updateAssignedResources(allocatedDevices);
+ container.getResourceMappings().addAssignedResources(GPU_URI,
+ assignedResources);
+
+ // Update state store.
+ nmContext.getNMStateStore().storeAssignedResources(containerId,
+ GPU_URI, allocatedDevices);
+ } catch (IOException e) {
+ // Roll back the in-memory assignment so devices are not leaked
+ // when the state store update fails.
+ cleanupAssignGpus(containerId);
+ throw new ResourceHandlerException(e);
+ }
+ }
+
+ return new GpuAllocation(assignedGpus,
+ Sets.difference(allowedGpuDevices, assignedGpus));
+ }
+ // Nothing requested: deny access to every allowed device.
+ return new GpuAllocation(null, allowedGpuDevices);
+ }
+
+ /**
+ * Releases all GPUs assigned to the given container.
+ * @param containerId containerId
+ */
+ public synchronized void cleanupAssignGpus(ContainerId containerId) {
+ Iterator<Map.Entry<Integer, ContainerId>> iter =
+ usedDevices.entrySet().iterator();
+ while (iter.hasNext()) {
+ if (iter.next().getValue().equals(containerId)) {
+ iter.remove();
+ }
+ }
+ }
+
+ /** Snapshot copy of the device->container map; exposed for tests. */
+ @VisibleForTesting
+ public synchronized Map<Integer, ContainerId> getDeviceAllocationMapping() {
+ return new HashMap<>(usedDevices);
+ }
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
new file mode 100644
index 0000000..7144bb2
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
@@ -0,0 +1,153 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.ResourceInformation;
+import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * {@link ResourceHandler} implementation for GPU isolation. Assigns GPU
+ * devices to containers via {@link GpuResourceAllocator} and enforces the
+ * assignment through the cgroups "devices" controller, driven by the
+ * container-executor binary.
+ */
+public class GpuResourceHandlerImpl implements ResourceHandler {
+ final static Log LOG = LogFactory
+ .getLog(GpuResourceHandlerImpl.class);
+
+ // CLI options understood by container-executor; used to tell it which
+ // GPUs to deny for which container.
+ public static final String EXCLUDED_GPUS_CLI_OPTION = "--excluded_gpus";
+ public static final String CONTAINER_ID_CLI_OPTION = "--container_id";
+
+ private GpuResourceAllocator gpuAllocator;
+ private CGroupsHandler cGroupsHandler;
+ private PrivilegedOperationExecutor privilegedOperationExecutor;
+
+ public GpuResourceHandlerImpl(Context nmContext,
+ CGroupsHandler cGroupsHandler,
+ PrivilegedOperationExecutor privilegedOperationExecutor) {
+ this.cGroupsHandler = cGroupsHandler;
+ this.privilegedOperationExecutor = privilegedOperationExecutor;
+ gpuAllocator = new GpuResourceAllocator(nmContext);
+ }
+
+ /**
+ * Discover GPUs usable by YARN, register them with the allocator and
+ * initialize the cgroups "devices" controller. Returns null because no
+ * privileged operations are needed at bootstrap time.
+ */
+ @Override
+ public List<PrivilegedOperation> bootstrap(Configuration configuration)
+ throws ResourceHandlerException {
+ List<Integer> minorNumbersOfUsableGpus;
+ try {
+ minorNumbersOfUsableGpus = GpuDiscoverer.getInstance()
+ .getMinorNumbersOfGpusUsableByYarn();
+ } catch (YarnException e) {
+ LOG.error("Exception when trying to get usable GPU device", e);
+ throw new ResourceHandlerException(e);
+ }
+
+ for (int minorNumber : minorNumbersOfUsableGpus) {
+ gpuAllocator.addGpu(minorNumber);
+ }
+
+ // And initialize cgroups
+ this.cGroupsHandler.initializeCGroupController(
+ CGroupsHandler.CGroupController.DEVICES);
+
+ return null;
+ }
+
+ /**
+ * Assign GPUs to the container (when requested), create its devices
+ * cgroup, then run container-executor to deny access to every GPU that
+ * was NOT assigned to it.
+ * NOTE(review): when the privileged operation fails, the cgroup is
+ * rolled back but the GPU assignment made above is not released here;
+ * presumably postComplete() releases it during container cleanup —
+ * confirm against the caller's failure path.
+ */
+ @Override
+ public synchronized List<PrivilegedOperation> preStart(Container container)
+ throws ResourceHandlerException {
+ String containerIdStr = container.getContainerId().toString();
+
+ // Assign Gpus to container if requested some.
+ GpuResourceAllocator.GpuAllocation allocation = gpuAllocator.assignGpus(
+ container);
+
+ // Create device cgroups for the container
+ cGroupsHandler.createCGroup(CGroupsHandler.CGroupController.DEVICES,
+ containerIdStr);
+ try {
+ // Execute c-e to setup GPU isolation before launch the container
+ PrivilegedOperation privilegedOperation = new PrivilegedOperation(
+ PrivilegedOperation.OperationType.GPU, Arrays
+ .asList(CONTAINER_ID_CLI_OPTION, containerIdStr));
+ if (!allocation.getDeniedGPUs().isEmpty()) {
+ // Only pass --excluded_gpus when there is something to deny.
+ privilegedOperation.appendArgs(Arrays.asList(EXCLUDED_GPUS_CLI_OPTION,
+ StringUtils.join(",", allocation.getDeniedGPUs())));
+ }
+
+ privilegedOperationExecutor.executePrivilegedOperation(
+ privilegedOperation, true);
+ } catch (PrivilegedOperationException e) {
+ // Roll back the cgroup created above before failing the launch.
+ cGroupsHandler.deleteCGroup(CGroupsHandler.CGroupController.DEVICES,
+ containerIdStr);
+ LOG.warn("Could not update cgroup for container", e);
+ throw new ResourceHandlerException(e);
+ }
+
+ // Ask the launcher to place the container's process into the devices
+ // cgroup created above.
+ List<PrivilegedOperation> ret = new ArrayList<>();
+ ret.add(new PrivilegedOperation(
+ PrivilegedOperation.OperationType.ADD_PID_TO_CGROUP,
+ PrivilegedOperation.CGROUP_ARG_PREFIX
+ + cGroupsHandler.getPathForCGroupTasks(
+ CGroupsHandler.CGroupController.DEVICES, containerIdStr)));
+
+ return ret;
+ }
+
+ @VisibleForTesting
+ public GpuResourceAllocator getGpuAllocator() {
+ return gpuAllocator;
+ }
+
+ /**
+ * Restore GPU assignments for a container recovered after NM restart.
+ */
+ @Override
+ public List<PrivilegedOperation> reacquireContainer(ContainerId containerId)
+ throws ResourceHandlerException {
+ gpuAllocator.recoverAssignedGpus(containerId);
+ return null;
+ }
+
+ /**
+ * Release the container's GPUs and remove its devices cgroup.
+ */
+ @Override
+ public synchronized List<PrivilegedOperation> postComplete(
+ ContainerId containerId) throws ResourceHandlerException {
+ gpuAllocator.cleanupAssignGpus(containerId);
+ cGroupsHandler.deleteCGroup(CGroupsHandler.CGroupController.DEVICES,
+ containerId.toString());
+ return null;
+ }
+
+ @Override
+ public List<PrivilegedOperation> teardown() throws ResourceHandlerException {
+ // Nothing to do on teardown.
+ return null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/NodeResourceUpdaterPlugin.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/NodeResourceUpdaterPlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/NodeResourceUpdaterPlugin.java
new file mode 100644
index 0000000..88f77ed
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/NodeResourceUpdaterPlugin.java
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
+
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+
+/**
+ * Plugins to handle resources on a node. This will be used by
+ * {@link org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater}.
+ */
+public abstract class NodeResourceUpdaterPlugin {
+ /**
+ * Update configured resource for the given component.
+ * @param res resource passed in by an external module (such as
+ * {@link org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater}).
+ * @throws YarnException when any issue happens.
+ */
+ public abstract void updateConfiguredResource(Resource res)
+ throws YarnException;
+
+ /**
+ * This method will be called when the node's resource is loaded from
+ * dynamic-resources.xml in ResourceManager.
+ *
+ * @param newResource newResource reported by RM
+ * @throws YarnException when there is any mismatch between NM and RM.
+ */
+ public void handleUpdatedResourceFromRM(Resource newResource) throws
+ YarnException {
+ // by default do nothing, subclass should implement this method when any
+ // special activities required upon new resource reported by RM.
+ }
+
+ // TODO: add implementation to update node attribute once YARN-3409 merged.
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePlugin.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePlugin.java
new file mode 100644
index 0000000..6e134b3
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePlugin.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
+
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain;
+
+/**
+ * {@link ResourcePlugin} is an interface for node manager to more easily
+ * support discovery/management/isolation for new resource types.
+ *
+ * <p>
+ * It has two major parts: {@link ResourcePlugin#createResourceHandler(Context,
+ * CGroupsHandler, PrivilegedOperationExecutor)} and
+ * {@link ResourcePlugin#getNodeResourceHandlerInstance()}, see javadocs below
+ * for more details.
+ * </p>
+ */
+public interface ResourcePlugin {
+ /**
+ * Initialize the plugin, this will be invoked during NM startup.
+ * @param context NM Context
+ * @throws YarnException when any issue occurs
+ */
+ void initialize(Context context) throws YarnException;
+
+ /**
+ * Plugin needs to return {@link ResourceHandler} when any special isolation
+ * is required for the resource type. This will be added to
+ * {@link ResourceHandlerChain} during NodeManager startup. When no special
+ * isolation is needed, return null.
+ *
+ * @param nmContext NodeManager context.
+ * @param cGroupsHandler CGroupsHandler
+ * @param privilegedOperationExecutor Privileged Operation Executor.
+ * @return ResourceHandler
+ */
+ ResourceHandler createResourceHandler(Context nmContext,
+ CGroupsHandler cGroupsHandler,
+ PrivilegedOperationExecutor privilegedOperationExecutor);
+
+ /**
+ * Plugin needs to return {@link NodeResourceUpdaterPlugin} when any discovery
+ * mechanism is required for the resource type. For example, if we want to set
+ * a resource-value during NM registration or send updates during NM-RM
+ * heartbeat, we can implement a {@link NodeResourceUpdaterPlugin} and update
+ * fields of
+ * {@link org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest}
+ * or {@link org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest}.
+ *
+ * This will be invoked during every node status update or node registration;
+ * please avoid creating a new instance every time.
+ *
+ * @return NodeResourceUpdaterPlugin, could be null when no discovery needed.
+ */
+ NodeResourceUpdaterPlugin getNodeResourceHandlerInstance();
+
+ /**
+ * Do cleanup of the plugin, this will be invoked when
+ * {@link org.apache.hadoop.yarn.server.nodemanager.NodeManager} stops.
+ * @throws YarnException if any issue occurs
+ */
+ void cleanup() throws YarnException;
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/51222908/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java
new file mode 100644
index 0000000..73d6038
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
+
+import com.google.common.collect.ImmutableSet;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
+
+/**
+ * Manages {@link ResourcePlugin} configured on this NodeManager.
+ */
+public class ResourcePluginManager {
+ private static final Logger LOG =
+ LoggerFactory.getLogger(ResourcePluginManager.class);
+ private static final Set<String> SUPPORTED_RESOURCE_PLUGINS = ImmutableSet.of(
+ GPU_URI);
+
+ // Resource name -> plugin; replaced wholesale by initialize().
+ // Collections.emptyMap() avoids the raw-typed EMPTY_MAP field and its
+ // unchecked-assignment warning.
+ private Map<String, ResourcePlugin> configuredPlugins =
+ Collections.emptyMap();
+
+ /**
+ * Instantiate and initialize every plugin named in
+ * {@link YarnConfiguration#NM_RESOURCE_PLUGINS}. Unsupported names fail
+ * fast; duplicated names are silently ignored.
+ * @param context NM context, used to read configuration.
+ * @throws YarnException when an unsupported plugin is configured or a
+ * plugin fails to initialize.
+ */
+ public synchronized void initialize(Context context)
+ throws YarnException {
+ Configuration conf = context.getConf();
+ String[] plugins = conf.getStrings(YarnConfiguration.NM_RESOURCE_PLUGINS);
+
+ if (plugins != null) {
+ Map<String, ResourcePlugin> pluginMap = new HashMap<>();
+
+ // Initialize each plugin
+ for (String resourceName : plugins) {
+ resourceName = resourceName.trim();
+ if (!SUPPORTED_RESOURCE_PLUGINS.contains(resourceName)) {
+ // Fix: commons-lang3 StringUtils.join takes (iterable, separator);
+ // the previous (separator, iterable) order resolved to the varargs
+ // join(T...) overload and produced a garbled message.
+ String msg =
+ "Trying to initialize resource plugin with name=" + resourceName
+ + ", it is not supported, list of supported plugins:"
+ + StringUtils.join(SUPPORTED_RESOURCE_PLUGINS, ",");
+ LOG.error(msg);
+ throw new YarnException(msg);
+ }
+
+ if (pluginMap.containsKey(resourceName)) {
+ // Duplicated items, ignore ...
+ continue;
+ }
+
+ ResourcePlugin plugin = null;
+ if (resourceName.equals(GPU_URI)) {
+ plugin = new GpuResourcePlugin();
+ }
+
+ if (plugin == null) {
+ // Defensive: SUPPORTED_RESOURCE_PLUGINS and the dispatch above
+ // must stay in sync.
+ throw new YarnException(
+ "This shouldn't happen, plugin=" + resourceName
+ + " should be loaded and initialized");
+ }
+ plugin.initialize(context);
+ pluginMap.put(resourceName, plugin);
+ }
+
+ configuredPlugins = Collections.unmodifiableMap(pluginMap);
+ }
+ }
+
+ /**
+ * Invoke cleanup on all configured plugins; called when the NM stops.
+ * @throws YarnException if any plugin fails to clean up.
+ */
+ public synchronized void cleanup() throws YarnException {
+ for (ResourcePlugin plugin : configuredPlugins.values()) {
+ plugin.cleanup();
+ }
+ }
+
+ /**
+ * Get resource name (such as gpu/fpga) to plugin references.
+ * @return read-only map of resource name to plugins.
+ */
+ public synchronized Map<String, ResourcePlugin> getNameToPlugins() {
+ return configuredPlugins;
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org