You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by jh...@apache.org on 2019/03/27 18:48:49 UTC
[hadoop] 03/11: YARN-9174. Backport YARN-7224 for refactoring of
GpuDevice class
This is an automated email from the ASF dual-hosted git repository.
jhung pushed a commit to branch YARN-8200.branch3
in repository https://gitbox.apache.org/repos/asf/hadoop.git
commit bd75c792b533b95177f76c87b09709c33687f766
Author: Jonathan Hung <jh...@linkedin.com>
AuthorDate: Thu Apr 12 13:56:16 2018 -0700
YARN-9174. Backport YARN-7224 for refactoring of GpuDevice class
---
.../linux/resources/gpu/GpuResourceAllocator.java | 102 ++++++--------
.../resources/gpu/GpuResourceHandlerImpl.java | 26 ++--
.../resourceplugin/gpu/GpuDevice.java | 78 +++++++++++
.../resourceplugin/gpu/GpuDiscoverer.java | 30 ++--
.../gpu/GpuNodeResourceUpdateHandler.java | 10 +-
.../recovery/NMLeveldbStateStoreService.java | 65 +++++----
.../recovery/NMNullStateStoreService.java | 3 +-
.../nodemanager/recovery/NMStateStoreService.java | 15 +-
.../TestContainerManagerRecovery.java | 9 +-
.../resources/gpu/TestGpuResourceHandler.java | 156 ++++++++++++++-------
.../resourceplugin/gpu/TestGpuDiscoverer.java | 34 +++--
.../recovery/NMMemoryStateStoreService.java | 8 +-
.../recovery/TestNMLeveldbStateStoreService.java | 22 ++-
13 files changed, 381 insertions(+), 177 deletions(-)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java
index d6bae09..f2bb342 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java
@@ -26,12 +26,11 @@ import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.Resource;
-import org.apache.hadoop.yarn.api.records.ResourceInformation;
import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
-import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
import java.io.IOException;
import java.io.Serializable;
@@ -54,8 +53,8 @@ import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
public class GpuResourceAllocator {
final static Log LOG = LogFactory.getLog(GpuResourceAllocator.class);
- private Set<Integer> allowedGpuDevices = new TreeSet<>();
- private Map<Integer, ContainerId> usedDevices = new TreeMap<>();
+ private Set<GpuDevice> allowedGpuDevices = new TreeSet<>();
+ private Map<GpuDevice, ContainerId> usedDevices = new TreeMap<>();
private Context nmContext;
public GpuResourceAllocator(Context ctx) {
@@ -63,14 +62,14 @@ public class GpuResourceAllocator {
}
/**
- * Contains allowed and denied devices with minor number.
+ * Contains allowed and denied devices.
* Denied devices will be useful for cgroups devices module to do blacklisting
*/
static class GpuAllocation {
- private Set<Integer> allowed = Collections.emptySet();
- private Set<Integer> denied = Collections.emptySet();
+ private Set<GpuDevice> allowed = Collections.emptySet();
+ private Set<GpuDevice> denied = Collections.emptySet();
- GpuAllocation(Set<Integer> allowed, Set<Integer> denied) {
+ GpuAllocation(Set<GpuDevice> allowed, Set<GpuDevice> denied) {
if (allowed != null) {
this.allowed = ImmutableSet.copyOf(allowed);
}
@@ -79,21 +78,21 @@ public class GpuResourceAllocator {
}
}
- public Set<Integer> getAllowedGPUs() {
+ public Set<GpuDevice> getAllowedGPUs() {
return allowed;
}
- public Set<Integer> getDeniedGPUs() {
+ public Set<GpuDevice> getDeniedGPUs() {
return denied;
}
}
/**
* Add GPU to allowed list
- * @param minorNumber minor number of the GPU device.
+ * @param gpuDevice gpu device
*/
- public synchronized void addGpu(int minorNumber) {
- allowedGpuDevices.add(minorNumber);
+ public synchronized void addGpu(GpuDevice gpuDevice) {
+ allowedGpuDevices.add(gpuDevice);
}
private String getResourceHandlerExceptionMessage(int numRequestedGpuDevices,
@@ -117,42 +116,42 @@ public class GpuResourceAllocator {
+ containerId);
}
- for (Serializable deviceId : c.getResourceMappings().getAssignedResources(
- GPU_URI)){
- if (!(deviceId instanceof String)) {
+ for (Serializable gpuDeviceSerializable : c.getResourceMappings()
+ .getAssignedResources(GPU_URI)) {
+ if (!(gpuDeviceSerializable instanceof GpuDevice)) {
throw new ResourceHandlerException(
"Trying to recover device id, however it"
- + " is not String, this shouldn't happen");
+ + " is not GpuDevice, this shouldn't happen");
}
-
- int devId;
- try {
- devId = Integer.parseInt((String)deviceId);
- } catch (NumberFormatException e) {
- throw new ResourceHandlerException("Failed to recover device id because"
- + "it is not a valid integer, devId:" + deviceId);
- }
+ GpuDevice gpuDevice = (GpuDevice) gpuDeviceSerializable;
// Make sure it is in allowed GPU device.
- if (!allowedGpuDevices.contains(devId)) {
- throw new ResourceHandlerException("Try to recover device id = " + devId
- + " however it is not in allowed device list:" + StringUtils
- .join(",", allowedGpuDevices));
+ if (!allowedGpuDevices.contains(gpuDevice)) {
+ throw new ResourceHandlerException(
+ "Try to recover device = " + gpuDevice
+ + " however it is not in allowed device list:" + StringUtils
+ .join(",", allowedGpuDevices));
}
// Make sure it is not occupied by anybody else
- if (usedDevices.containsKey(devId)) {
- throw new ResourceHandlerException("Try to recover device id = " + devId
- + " however it is already assigned to container=" + usedDevices
- .get(devId) + ", please double check what happened.");
+ if (usedDevices.containsKey(gpuDevice)) {
+ throw new ResourceHandlerException(
+ "Try to recover device id = " + gpuDevice
+ + " however it is already assigned to container=" + usedDevices
+ .get(gpuDevice) + ", please double check what happened.");
}
- usedDevices.put(devId, containerId);
+ usedDevices.put(gpuDevice, containerId);
}
}
- private int getRequestedGpus(Resource requestedResource) {
+ /**
+ * Get number of requested GPUs from resource.
+ * @param requestedResource requested resource
+ * @return number of GPUs requested.
+ */
+ public static int getRequestedGpus(Resource requestedResource) {
try {
return Long.valueOf(requestedResource.getResourceValue(
GPU_URI)).intValue();
@@ -164,8 +163,8 @@ public class GpuResourceAllocator {
/**
* Assign GPU to requestor
* @param container container to allocate
- * @return List of denied Gpus with minor numbers
- * @throws ResourceHandlerException When failed to
+ * @return allocation results.
+ * @throws ResourceHandlerException When failed to assign GPUs.
*/
public synchronized GpuAllocation assignGpus(Container container)
throws ResourceHandlerException {
@@ -180,12 +179,12 @@ public class GpuResourceAllocator {
containerId));
}
- Set<Integer> assignedGpus = new HashSet<>();
+ Set<GpuDevice> assignedGpus = new TreeSet<>();
- for (int deviceNum : allowedGpuDevices) {
- if (!usedDevices.containsKey(deviceNum)) {
- usedDevices.put(deviceNum, containerId);
- assignedGpus.add(deviceNum);
+ for (GpuDevice gpu : allowedGpuDevices) {
+ if (!usedDevices.containsKey(gpu)) {
+ usedDevices.put(gpu, containerId);
+ assignedGpus.add(gpu);
if (assignedGpus.size() == numRequestedGpuDevices) {
break;
}
@@ -194,21 +193,10 @@ public class GpuResourceAllocator {
// Record in state store if we allocated anything
if (!assignedGpus.isEmpty()) {
- List<Serializable> allocatedDevices = new ArrayList<>();
- for (int gpu : assignedGpus) {
- allocatedDevices.add(String.valueOf(gpu));
- }
try {
- // Update Container#getResourceMapping.
- ResourceMappings.AssignedResources assignedResources =
- new ResourceMappings.AssignedResources();
- assignedResources.updateAssignedResources(allocatedDevices);
- container.getResourceMappings().addAssignedResources(GPU_URI,
- assignedResources);
-
// Update state store.
- nmContext.getNMStateStore().storeAssignedResources(containerId,
- GPU_URI, allocatedDevices);
+ nmContext.getNMStateStore().storeAssignedResources(container, GPU_URI,
+ new ArrayList<>(assignedGpus));
} catch (IOException e) {
cleanupAssignGpus(containerId);
throw new ResourceHandlerException(e);
@@ -226,7 +214,7 @@ public class GpuResourceAllocator {
* @param containerId containerId
*/
public synchronized void cleanupAssignGpus(ContainerId containerId) {
- Iterator<Map.Entry<Integer, ContainerId>> iter =
+ Iterator<Map.Entry<GpuDevice, ContainerId>> iter =
usedDevices.entrySet().iterator();
while (iter.hasNext()) {
if (iter.next().getValue().equals(containerId)) {
@@ -236,7 +224,7 @@ public class GpuResourceAllocator {
}
@VisibleForTesting
- public synchronized Map<Integer, ContainerId> getDeviceAllocationMapping() {
+ public synchronized Map<GpuDevice, ContainerId> getDeviceAllocationMapping() {
return new HashMap<>(usedDevices);
}
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
index 7144bb2..4a783d3 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
@@ -24,8 +24,6 @@ import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.api.records.ContainerId;
-import org.apache.hadoop.yarn.api.records.ResourceInformation;
-import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
@@ -35,6 +33,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileg
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
import java.util.ArrayList;
@@ -64,17 +63,23 @@ public class GpuResourceHandlerImpl implements ResourceHandler {
@Override
public List<PrivilegedOperation> bootstrap(Configuration configuration)
throws ResourceHandlerException {
- List<Integer> minorNumbersOfUsableGpus;
+ List<GpuDevice> usableGpus;
try {
- minorNumbersOfUsableGpus = GpuDiscoverer.getInstance()
- .getMinorNumbersOfGpusUsableByYarn();
+ usableGpus = GpuDiscoverer.getInstance()
+ .getGpusUsableByYarn();
+ if (usableGpus == null || usableGpus.isEmpty()) {
+ String message = "GPU is enabled on the NodeManager, but couldn't find "
+ + "any usable GPU devices, please double check configuration.";
+ LOG.error(message);
+ throw new ResourceHandlerException(message);
+ }
} catch (YarnException e) {
LOG.error("Exception when trying to get usable GPU device", e);
throw new ResourceHandlerException(e);
}
- for (int minorNumber : minorNumbersOfUsableGpus) {
- gpuAllocator.addGpu(minorNumber);
+ for (GpuDevice gpu : usableGpus) {
+ gpuAllocator.addGpu(gpu);
}
// And initialize cgroups
@@ -102,10 +107,13 @@ public class GpuResourceHandlerImpl implements ResourceHandler {
PrivilegedOperation.OperationType.GPU, Arrays
.asList(CONTAINER_ID_CLI_OPTION, containerIdStr));
if (!allocation.getDeniedGPUs().isEmpty()) {
+ List<Integer> minorNumbers = new ArrayList<>();
+ for (GpuDevice deniedGpu : allocation.getDeniedGPUs()) {
+ minorNumbers.add(deniedGpu.getMinorNumber());
+ }
privilegedOperation.appendArgs(Arrays.asList(EXCLUDED_GPUS_CLI_OPTION,
- StringUtils.join(",", allocation.getDeniedGPUs())));
+ StringUtils.join(",", minorNumbers)));
}
-
privilegedOperationExecutor.executePrivilegedOperation(
privilegedOperation, true);
} catch (PrivilegedOperationException e) {
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDevice.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDevice.java
new file mode 100644
index 0000000..8119924
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDevice.java
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import java.io.Serializable;
+
+/**
+ * This class is used to represent a GPU device during allocation.
+ */
+public class GpuDevice implements Serializable, Comparable {
+ private int index;
+ private int minorNumber;
+ private static final long serialVersionUID = -6812314470754667710L;
+
+ public GpuDevice(int index, int minorNumber) {
+ this.index = index;
+ this.minorNumber = minorNumber;
+ }
+
+ public int getIndex() {
+ return index;
+ }
+
+ public int getMinorNumber() {
+ return minorNumber;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == null || !(obj instanceof GpuDevice)) {
+ return false;
+ }
+ GpuDevice other = (GpuDevice) obj;
+ return index == other.index && minorNumber == other.minorNumber;
+ }
+
+ @Override
+ public int compareTo(Object obj) {
+ if (obj == null || (!(obj instanceof GpuDevice))) {
+ return -1;
+ }
+
+ GpuDevice other = (GpuDevice) obj;
+
+ int result = Integer.compare(index, other.index);
+ if (0 != result) {
+ return result;
+ }
+ return Integer.compare(minorNumber, other.minorNumber);
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 47;
+ return prime * index + minorNumber;
+ }
+
+ @Override
+ public String toString() {
+ return "(index=" + index + ",minor_number=" + minorNumber + ")";
+ }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
index 61b8ce5..6e3cf13 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
@@ -136,12 +136,12 @@ public class GpuDiscoverer {
}
/**
- * Get list of minor device numbers of Gpu devices usable by YARN.
+ * Get list of GPU devices usable by YARN.
*
- * @return List of minor device numbers of Gpu devices.
+ * @return List of GPU devices
* @throws YarnException when any issue happens
*/
- public synchronized List<Integer> getMinorNumbersOfGpusUsableByYarn()
+ public synchronized List<GpuDevice> getGpusUsableByYarn()
throws YarnException {
validateConfOrThrowException();
@@ -149,7 +149,7 @@ public class GpuDiscoverer {
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
- List<Integer> minorNumbers = new ArrayList<>();
+ List<GpuDevice> gpuDevices = new ArrayList<>();
if (allowedDevicesStr.equals(
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) {
@@ -167,21 +167,31 @@ public class GpuDiscoverer {
}
if (lastDiscoveredGpuInformation.getGpus() != null) {
- for (PerGpuDeviceInformation gpu : lastDiscoveredGpuInformation
- .getGpus()) {
- minorNumbers.add(gpu.getMinorNumber());
+ for (int i = 0; i < lastDiscoveredGpuInformation.getGpus().size();
+ i++) {
+ List<PerGpuDeviceInformation> gpuInfos =
+ lastDiscoveredGpuInformation.getGpus();
+ gpuDevices.add(new GpuDevice(i, gpuInfos.get(i).getMinorNumber()));
}
}
} else{
for (String s : allowedDevicesStr.split(",")) {
if (s.trim().length() > 0) {
- minorNumbers.add(Integer.valueOf(s.trim()));
+ String[] kv = s.trim().split(":");
+ if (kv.length != 2) {
+ throw new YarnException(
+ "Illegal format, it should be index:minor_number format, now it="
+ + s);
+ }
+
+ gpuDevices.add(
+ new GpuDevice(Integer.parseInt(kv[0]), Integer.parseInt(kv[1])));
}
}
- LOG.info("Allowed GPU devices with minor numbers:" + allowedDevicesStr);
+ LOG.info("Allowed GPU devices:" + gpuDevices);
}
- return minorNumbers;
+ return gpuDevices;
}
public synchronized void initialize(Configuration conf) throws YarnException {
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
index f6bf506..796eb25 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
@@ -40,12 +40,14 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
public void updateConfiguredResource(Resource res) throws YarnException {
LOG.info("Initializing configured GPU resources for the NodeManager.");
- List<Integer> usableGpus =
- GpuDiscoverer.getInstance().getMinorNumbersOfGpusUsableByYarn();
+ List<GpuDevice> usableGpus =
+ GpuDiscoverer.getInstance().getGpusUsableByYarn();
if (null == usableGpus || usableGpus.isEmpty()) {
- LOG.info("Didn't find any usable GPUs on the NodeManager.");
+ String message = "GPU is enabled, but couldn't find any usable GPUs on the "
+ + "NodeManager.";
+ LOG.error(message);
// No gpu can be used by YARN.
- return;
+ throw new YarnException(message);
}
long nUsableGpus = usableGpus.size();
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
index 2261db0..374cc29 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
@@ -18,27 +18,9 @@
package org.apache.hadoop.yarn.server.nodemanager.recovery;
-import static org.fusesource.leveldbjni.JniDBFactory.asString;
-import static org.fusesource.leveldbjni.JniDBFactory.bytes;
-
-import org.apache.hadoop.yarn.api.records.Token;
-import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
-import org.slf4j.LoggerFactory;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Timer;
-import java.util.TimerTask;
-import java.util.Set;
-
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.ListMultimap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -50,9 +32,11 @@ import org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainerRequestP
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.Token;
import org.apache.hadoop.yarn.api.records.impl.pb.ResourcePBImpl;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.proto.YarnProtos.LocalResourceProto;
+import org.apache.hadoop.yarn.proto.YarnSecurityTokenProtos.ContainerTokenIdentifierProto;
import org.apache.hadoop.yarn.proto.YarnServerCommonProtos.MasterKeyProto;
import org.apache.hadoop.yarn.proto.YarnServerCommonProtos.VersionProto;
import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.ContainerManagerApplicationProto;
@@ -60,9 +44,10 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.Deletion
import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LocalizedResourceProto;
import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDeleterProto;
import org.apache.hadoop.yarn.proto.YarnServiceProtos.StartContainerRequestProto;
-import org.apache.hadoop.yarn.proto.YarnSecurityTokenProtos.ContainerTokenIdentifierProto;
+import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.api.records.MasterKey;
import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
import org.apache.hadoop.yarn.server.records.Version;
import org.apache.hadoop.yarn.server.records.impl.pb.VersionPBImpl;
@@ -75,10 +60,26 @@ import org.iq80.leveldb.DB;
import org.iq80.leveldb.DBException;
import org.iq80.leveldb.Options;
import org.iq80.leveldb.WriteBatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.Timer;
+import java.util.TimerTask;
+
+import static org.fusesource.leveldbjni.JniDBFactory.asString;
+import static org.fusesource.leveldbjni.JniDBFactory.bytes;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.collect.ArrayListMultimap;
-import com.google.common.collect.ListMultimap;
public class NMLeveldbStateStoreService extends NMStateStoreService {
@@ -1184,15 +1185,18 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
}
@Override
- public void storeAssignedResources(ContainerId containerId,
+ public void storeAssignedResources(Container container,
String resourceType, List<Serializable> assignedResources)
throws IOException {
if (LOG.isDebugEnabled()) {
- LOG.debug("storeAssignedResources: containerId=" + containerId
- + ", assignedResources=" + StringUtils.join(",", assignedResources));
+ LOG.debug(
+ "storeAssignedResources: containerId=" + container.getContainerId()
+ + ", assignedResources=" + StringUtils
+ .join(",", assignedResources));
+
}
- String keyResChng = CONTAINERS_KEY_PREFIX + containerId.toString()
+ String keyResChng = CONTAINERS_KEY_PREFIX + container.getContainerId().toString()
+ CONTAINER_ASSIGNED_RESOURCES_KEY_SUFFIX + resourceType;
try {
WriteBatch batch = db.createWriteBatch();
@@ -1210,6 +1214,9 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
} catch (DBException e) {
throw new IOException(e);
}
+
+ // update container resource mapping.
+ updateContainerResourceMapping(container, resourceType, assignedResources);
}
@SuppressWarnings("deprecation")
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java
index 6e3707b..7d1010f 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java
@@ -35,6 +35,7 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.Localize
import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDeleterProto;
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.api.records.MasterKey;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
// The state store to use when state isn't being stored
public class NMNullStateStoreService extends NMStateStoreService {
@@ -268,7 +269,7 @@ public class NMNullStateStoreService extends NMStateStoreService {
}
@Override
- public void storeAssignedResources(ContainerId containerId,
+ public void storeAssignedResources(Container container,
String resourceType, List<Serializable> assignedResources)
throws IOException {
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java
index a929fe2..350f242 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java
@@ -44,6 +44,7 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.Localize
import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDeleterProto;
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.api.records.MasterKey;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
@Private
@@ -732,12 +733,12 @@ public abstract class NMStateStoreService extends AbstractService {
/**
* Store the assigned resources to a container.
*
- * @param containerId Container Id
+ * @param container NMContainer
* @param resourceType Resource Type
* @param assignedResources Assigned resources
* @throws IOException if fails
*/
- public abstract void storeAssignedResources(ContainerId containerId,
+ public abstract void storeAssignedResources(Container container,
String resourceType, List<Serializable> assignedResources)
throws IOException;
@@ -746,4 +747,14 @@ public abstract class NMStateStoreService extends AbstractService {
protected abstract void startStorage() throws IOException;
protected abstract void closeStorage() throws IOException;
+
+ protected void updateContainerResourceMapping(Container container,
+ String resourceType, List<Serializable> assignedResources) {
+ // Update Container#getResourceMappings.
+ ResourceMappings.AssignedResources newAssigned =
+ new ResourceMappings.AssignedResources();
+ newAssigned.updateAssignedResources(assignedResources);
+ container.getResourceMappings().addAssignedResources(resourceType,
+ newAssigned);
+ }
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
index 6c0c6de..1e919af 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
@@ -519,16 +519,18 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
commonLaunchContainer(appId, cid, cm);
+ Container nmContainer = context.getContainers().get(cid);
+
Application app = context.getApplications().get(appId);
assertNotNull(app);
// store resource mapping of the container
List<Serializable> gpuResources = Arrays.asList("1", "2", "3");
- stateStore.storeAssignedResources(cid, "gpu", gpuResources);
+ stateStore.storeAssignedResources(nmContainer, "gpu", gpuResources);
List<Serializable> numaResources = Arrays.asList("numa1");
- stateStore.storeAssignedResources(cid, "numa", numaResources);
+ stateStore.storeAssignedResources(nmContainer, "numa", numaResources);
List<Serializable> fpgaResources = Arrays.asList("fpga1", "fpga2");
- stateStore.storeAssignedResources(cid, "fpga", fpgaResources);
+ stateStore.storeAssignedResources(nmContainer, "fpga", fpgaResources);
cm.stop();
context = createContext(conf, stateStore);
@@ -540,7 +542,6 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
app = context.getApplications().get(appId);
assertNotNull(app);
- Container nmContainer = context.getContainers().get(cid);
Assert.assertNotNull(nmContainer);
ResourceMappings resourceMappings = nmContainer.getResourceMappings();
List<Serializable> assignedResource = resourceMappings
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
index 5c70f7a..7caa0e8 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
@@ -20,7 +20,6 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resourc
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.yarn.api.protocolrecords.ResourceTypes;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
@@ -36,15 +35,17 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileg
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
-import org.apache.hadoop.yarn.util.resource.ResourceUtils;
import org.apache.hadoop.yarn.util.resource.TestResourceUtils;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
@@ -90,7 +91,7 @@ public class TestGpuResourceHandler {
@Test
public void testBootStrap() throws Exception {
Configuration conf = new YarnConfiguration();
- conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0");
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
GpuDiscoverer.getInstance().initialize(conf);
@@ -104,8 +105,8 @@ public class TestGpuResourceHandler {
.newInstance(ApplicationId.newInstance(1234L, 1), 1), id);
}
- private static Container mockContainerWithGpuRequest(int id,
- int numGpuRequest) {
+ private static Container mockContainerWithGpuRequest(int id, int numGpuRequest,
+ boolean dockerContainerEnabled) {
Container c = mock(Container.class);
when(c.getContainerId()).thenReturn(getContainerId(id));
@@ -115,29 +116,46 @@ public class TestGpuResourceHandler {
res.setResourceValue(ResourceInformation.GPU_URI, numGpuRequest);
when(c.getResource()).thenReturn(res);
when(c.getResourceMappings()).thenReturn(resMapping);
+
+ ContainerLaunchContext clc = mock(ContainerLaunchContext.class);
+ Map<String, String> env = new HashMap<>();
+ if (dockerContainerEnabled) {
+ env.put(ContainerRuntimeConstants.ENV_CONTAINER_TYPE, "docker");
+ }
+ when(clc.getEnvironment()).thenReturn(env);
+ when(c.getLaunchContext()).thenReturn(clc);
return c;
}
+ private static Container mockContainerWithGpuRequest(int id,
+ int numGpuRequest) {
+ return mockContainerWithGpuRequest(id, numGpuRequest, false);
+ }
+
private void verifyDeniedDevices(ContainerId containerId,
- List<Integer> deniedDevices)
+ List<GpuDevice> deniedDevices)
throws ResourceHandlerException, PrivilegedOperationException {
verify(mockCGroupsHandler, times(1)).createCGroup(
CGroupsHandler.CGroupController.DEVICES, containerId.toString());
if (null != deniedDevices && !deniedDevices.isEmpty()) {
+ List<Integer> deniedDevicesMinorNumber = new ArrayList<>();
+ for (GpuDevice deniedDevice : deniedDevices) {
+ deniedDevicesMinorNumber.add(deniedDevice.getMinorNumber());
+ }
verify(mockPrivilegedExecutor, times(1)).executePrivilegedOperation(
new PrivilegedOperation(PrivilegedOperation.OperationType.GPU, Arrays
.asList(GpuResourceHandlerImpl.CONTAINER_ID_CLI_OPTION,
containerId.toString(),
GpuResourceHandlerImpl.EXCLUDED_GPUS_CLI_OPTION,
- StringUtils.join(",", deniedDevices))), true);
+ StringUtils.join(",", deniedDevicesMinorNumber))), true);
}
}
- @Test
- public void testAllocation() throws Exception {
+ private void commonTestAllocation(boolean dockerContainerEnabled)
+ throws Exception {
Configuration conf = new YarnConfiguration();
- conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
GpuDiscoverer.getInstance().initialize(conf);
gpuResourceHandler.bootstrap(conf);
@@ -145,31 +163,52 @@ public class TestGpuResourceHandler {
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
/* Start container 1, asks 3 containers */
- gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
+ gpuResourceHandler.preStart(
+ mockContainerWithGpuRequest(1, 3, dockerContainerEnabled));
// Only device=4 will be blocked.
- verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
+ if (dockerContainerEnabled) {
+ verifyDeniedDevices(getContainerId(1), Collections.emptyList());
+ } else{
+ verifyDeniedDevices(getContainerId(1), Arrays.asList(new GpuDevice(3,4)));
+ }
/* Start container 2, asks 2 containers. Excepted to fail */
boolean failedToAllocate = false;
try {
- gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 2));
+ gpuResourceHandler.preStart(
+ mockContainerWithGpuRequest(2, 2, dockerContainerEnabled));
} catch (ResourceHandlerException e) {
failedToAllocate = true;
}
Assert.assertTrue(failedToAllocate);
/* Start container 3, ask 1 container, succeeded */
- gpuResourceHandler.preStart(mockContainerWithGpuRequest(3, 1));
+ gpuResourceHandler.preStart(
+ mockContainerWithGpuRequest(3, 1, dockerContainerEnabled));
// devices = 0/1/3 will be blocked
- verifyDeniedDevices(getContainerId(3), Arrays.asList(0, 1, 3));
+ if (dockerContainerEnabled) {
+ verifyDeniedDevices(getContainerId(3), Collections.<GpuDevice>emptyList());
+ } else {
+ verifyDeniedDevices(getContainerId(3), Arrays
+ .asList(new GpuDevice(0, 0), new GpuDevice(1, 1),
+ new GpuDevice(2, 3)));
+ }
- /* Start container 4, ask 0 container, succeeded */
- gpuResourceHandler.preStart(mockContainerWithGpuRequest(4, 0));
- // All devices will be blocked
- verifyDeniedDevices(getContainerId(4), Arrays.asList(0, 1, 3, 4));
+ /* Start container 4, ask 0 container, succeeded */
+ gpuResourceHandler.preStart(
+ mockContainerWithGpuRequest(4, 0, dockerContainerEnabled));
+
+ if (dockerContainerEnabled) {
+ verifyDeniedDevices(getContainerId(4), Collections.<GpuDevice>emptyList());
+ } else{
+ // All devices will be blocked
+ verifyDeniedDevices(getContainerId(4), Arrays
+ .asList(new GpuDevice(0, 0), new GpuDevice(1, 1), new GpuDevice(2, 3),
+ new GpuDevice(3, 4)));
+ }
/* Release container-1, expect cgroups deleted */
gpuResourceHandler.postComplete(getContainerId(1));
@@ -188,12 +227,24 @@ public class TestGpuResourceHandler {
gpuResourceHandler.getGpuAllocator().getAvailableGpus());
}
+ @Test
+ public void testAllocationWhenDockerContainerEnabled() throws Exception {
+ // When docker container is enabled, no devices should be written to
+ // devices.deny.
+ commonTestAllocation(true);
+ }
+
+ @Test
+ public void testAllocation() throws Exception {
+ commonTestAllocation(false);
+ }
+
@SuppressWarnings("unchecked")
@Test
public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
throws Exception {
Configuration conf = new YarnConfiguration();
- conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
GpuDiscoverer.getInstance().initialize(conf);
gpuResourceHandler.bootstrap(conf);
@@ -202,7 +253,7 @@ public class TestGpuResourceHandler {
doThrow(new IOException("Exception ...")).when(mockNMStateStore)
.storeAssignedResources(
- any(ContainerId.class), anyString(), anyList());
+ any(Container.class), anyString(), anyList());
boolean exception = false;
/* Start container 1, asks 3 containers */
@@ -225,9 +276,12 @@ public class TestGpuResourceHandler {
conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
GpuDiscoverer.getInstance().initialize(conf);
- gpuResourceHandler.bootstrap(conf);
- Assert.assertEquals(0,
- gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+ try {
+ gpuResourceHandler.bootstrap(conf);
+ Assert.fail("Should fail because no GPU available");
+ } catch (ResourceHandlerException e) {
+ // Expected because of no resource available
+ }
/* Start container 1, asks 0 containers */
gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 0));
@@ -254,7 +308,7 @@ public class TestGpuResourceHandler {
@Test
public void testAllocationStored() throws Exception {
Configuration conf = new YarnConfiguration();
- conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
GpuDiscoverer.getInstance().initialize(conf);
gpuResourceHandler.bootstrap(conf);
@@ -265,33 +319,33 @@ public class TestGpuResourceHandler {
Container container = mockContainerWithGpuRequest(1, 3);
gpuResourceHandler.preStart(container);
- verify(mockNMStateStore).storeAssignedResources(getContainerId(1),
- ResourceInformation.GPU_URI,
- Arrays.asList("0", "1", "3"));
-
- Assert.assertEquals(3, container.getResourceMappings()
- .getAssignedResources(ResourceInformation.GPU_URI).size());
+ verify(mockNMStateStore).storeAssignedResources(container,
+ ResourceInformation.GPU_URI, Arrays
+ .asList(new GpuDevice(0, 0), new GpuDevice(1, 1),
+ new GpuDevice(2, 3)));
// Only device=4 will be blocked.
- verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
+ verifyDeniedDevices(getContainerId(1), Arrays.asList(new GpuDevice(3, 4)));
/* Start container 2, ask 0 container, succeeded */
container = mockContainerWithGpuRequest(2, 0);
gpuResourceHandler.preStart(container);
- verifyDeniedDevices(getContainerId(2), Arrays.asList(0, 1, 3, 4));
+ verifyDeniedDevices(getContainerId(2), Arrays
+ .asList(new GpuDevice(0, 0), new GpuDevice(1, 1), new GpuDevice(2, 3),
+ new GpuDevice(3, 4)));
Assert.assertEquals(0, container.getResourceMappings()
.getAssignedResources(ResourceInformation.GPU_URI).size());
// Store assigned resource will not be invoked.
verify(mockNMStateStore, never()).storeAssignedResources(
- eq(getContainerId(2)), eq(ResourceInformation.GPU_URI), anyList());
+ eq(container), eq(ResourceInformation.GPU_URI), anyList());
}
@Test
public void testRecoverResourceAllocation() throws Exception {
Configuration conf = new YarnConfiguration();
- conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
GpuDiscoverer.getInstance().initialize(conf);
gpuResourceHandler.bootstrap(conf);
@@ -302,7 +356,8 @@ public class TestGpuResourceHandler {
ResourceMappings rmap = new ResourceMappings();
ResourceMappings.AssignedResources ar =
new ResourceMappings.AssignedResources();
- ar.updateAssignedResources(Arrays.asList("1", "3"));
+ ar.updateAssignedResources(
+ Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3)));
rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
when(nmContainer.getResourceMappings()).thenReturn(rmap);
@@ -312,12 +367,15 @@ public class TestGpuResourceHandler {
// Reacquire container restore state of GPU Resource Allocator.
gpuResourceHandler.reacquireContainer(getContainerId(1));
- Map<Integer, ContainerId> deviceAllocationMapping =
+ Map<GpuDevice, ContainerId> deviceAllocationMapping =
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
Assert.assertEquals(2, deviceAllocationMapping.size());
Assert.assertTrue(
- deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
- Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
+ deviceAllocationMapping.keySet().contains(new GpuDevice(1, 1)));
+ Assert.assertTrue(
+ deviceAllocationMapping.keySet().contains(new GpuDevice(2, 3)));
+ Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)),
+ getContainerId(1));
// TEST CASE
// Try to reacquire a container but requested device is not in allowed list.
@@ -325,7 +383,8 @@ public class TestGpuResourceHandler {
rmap = new ResourceMappings();
ar = new ResourceMappings.AssignedResources();
// id=5 is not in allowed list.
- ar.updateAssignedResources(Arrays.asList("4", "5"));
+ ar.updateAssignedResources(
+ Arrays.asList(new GpuDevice(3, 4), new GpuDevice(4, 5)));
rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
when(nmContainer.getResourceMappings()).thenReturn(rmap);
@@ -345,9 +404,10 @@ public class TestGpuResourceHandler {
deviceAllocationMapping =
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
Assert.assertEquals(2, deviceAllocationMapping.size());
- Assert.assertTrue(
- deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
- Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
+ Assert.assertTrue(deviceAllocationMapping.keySet()
+ .containsAll(Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3))));
+ Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)),
+ getContainerId(1));
// TEST CASE
// Try to reacquire a container but requested device is already assigned.
@@ -355,7 +415,8 @@ public class TestGpuResourceHandler {
rmap = new ResourceMappings();
ar = new ResourceMappings.AssignedResources();
// id=3 is already assigned
- ar.updateAssignedResources(Arrays.asList("4", "3"));
+ ar.updateAssignedResources(
+ Arrays.asList(new GpuDevice(3, 4), new GpuDevice(2, 3)));
rmap.addAssignedResources("gpu", ar);
when(nmContainer.getResourceMappings()).thenReturn(rmap);
@@ -375,8 +436,9 @@ public class TestGpuResourceHandler {
deviceAllocationMapping =
gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
Assert.assertEquals(2, deviceAllocationMapping.size());
- Assert.assertTrue(
- deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
- Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
+ Assert.assertTrue(deviceAllocationMapping.keySet()
+ .containsAll(Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3))));
+ Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)),
+ getContainerId(1));
}
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
index 83bace2..4abb633 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
@@ -101,23 +101,41 @@ public class TestGpuDiscoverer {
GpuDeviceInformation info = plugin.getGpuDeviceInformation();
Assert.assertTrue(info.getGpus().size() > 0);
- Assert.assertEquals(plugin.getMinorNumbersOfGpusUsableByYarn().size(),
+ Assert.assertEquals(plugin.getGpusUsableByYarn().size(),
info.getGpus().size());
}
@Test
public void getNumberOfUsableGpusFromConfig() throws YarnException {
Configuration conf = new Configuration(false);
- conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,2,4");
+
+ // Illegal format
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3");
GpuDiscoverer plugin = new GpuDiscoverer();
+ try {
+ plugin.initialize(conf);
+ plugin.getGpusUsableByYarn();
+ Assert.fail("Illegal format, should fail.");
+ } catch (YarnException e) {
+ // Expected
+ }
+
+ // Valid format
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3:4");
+ plugin = new GpuDiscoverer();
plugin.initialize(conf);
- List<Integer> minorNumbers = plugin.getMinorNumbersOfGpusUsableByYarn();
- Assert.assertEquals(4, minorNumbers.size());
+ List<GpuDevice> usableGpuDevices = plugin.getGpusUsableByYarn();
+ Assert.assertEquals(4, usableGpuDevices.size());
+
+ Assert.assertTrue(0 == usableGpuDevices.get(0).getIndex());
+ Assert.assertTrue(1 == usableGpuDevices.get(1).getIndex());
+ Assert.assertTrue(2 == usableGpuDevices.get(2).getIndex());
+ Assert.assertTrue(3 == usableGpuDevices.get(3).getIndex());
- Assert.assertTrue(0 == minorNumbers.get(0));
- Assert.assertTrue(1 == minorNumbers.get(1));
- Assert.assertTrue(2 == minorNumbers.get(2));
- Assert.assertTrue(4 == minorNumbers.get(3));
+ Assert.assertTrue(0 == usableGpuDevices.get(0).getMinorNumber());
+ Assert.assertTrue(1 == usableGpuDevices.get(1).getMinorNumber());
+ Assert.assertTrue(2 == usableGpuDevices.get(2).getMinorNumber());
+ Assert.assertTrue(4 == usableGpuDevices.get(3).getMinorNumber());
}
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
index 5d424ad..4364709 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
@@ -43,6 +43,7 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDelet
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.api.records.MasterKey;
import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
@@ -515,14 +516,17 @@ public class NMMemoryStateStoreService extends NMStateStoreService {
}
@Override
- public void storeAssignedResources(ContainerId containerId,
+ public void storeAssignedResources(Container container,
String resourceType, List<Serializable> assignedResources)
throws IOException {
ResourceMappings.AssignedResources ar =
new ResourceMappings.AssignedResources();
ar.updateAssignedResources(assignedResources);
- containerStates.get(containerId).getResourceMappings()
+ containerStates.get(container.getContainerId()).getResourceMappings()
.addAssignedResources(resourceType, ar);
+
+ // update container resource mapping.
+ updateContainerResourceMapping(container, resourceType, assignedResources);
}
private static class TrackerState {
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
index bc17868..f93f3e6 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
@@ -32,6 +32,7 @@ import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.timeout;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
import java.io.File;
import java.io.IOException;
@@ -72,6 +73,8 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDelet
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.api.records.MasterKey;
import org.apache.hadoop.yarn.server.nodemanager.amrmproxy.AMRMProxyTokenSecretManager;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.LocalResourceTrackerState;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredAMRMProxyState;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredApplicationsState;
@@ -1151,16 +1154,21 @@ public class TestNMLeveldbStateStoreService {
ContainerId containerId = ContainerId.newContainerId(appAttemptId, 5);
storeMockContainer(containerId);
+ Container container = mock(Container.class);
+ when(container.getContainerId()).thenReturn(containerId);
+ ResourceMappings resourceMappings = new ResourceMappings();
+ when(container.getResourceMappings()).thenReturn(resourceMappings);
+
// Store ResourceMapping
- stateStore.storeAssignedResources(containerId, "gpu",
+ stateStore.storeAssignedResources(container, "gpu",
Arrays.asList("1", "2", "3"));
// This will overwrite above
List<Serializable> gpuRes1 = Arrays.asList("1", "2", "4");
- stateStore.storeAssignedResources(containerId, "gpu", gpuRes1);
+ stateStore.storeAssignedResources(container, "gpu", gpuRes1);
List<Serializable> fpgaRes = Arrays.asList("3", "4", "5", "6");
- stateStore.storeAssignedResources(containerId, "fpga", fpgaRes);
+ stateStore.storeAssignedResources(container, "fpga", fpgaRes);
List<Serializable> numaRes = Arrays.asList("numa1");
- stateStore.storeAssignedResources(containerId, "numa", numaRes);
+ stateStore.storeAssignedResources(container, "numa", numaRes);
// add a invalid key
restartStateStore();
@@ -1170,12 +1178,18 @@ public class TestNMLeveldbStateStoreService {
List<Serializable> res = rcs.getResourceMappings()
.getAssignedResources("gpu");
Assert.assertTrue(res.equals(gpuRes1));
+ Assert.assertTrue(
+ resourceMappings.getAssignedResources("gpu").equals(gpuRes1));
res = rcs.getResourceMappings().getAssignedResources("fpga");
Assert.assertTrue(res.equals(fpgaRes));
+ Assert.assertTrue(
+ resourceMappings.getAssignedResources("fpga").equals(fpgaRes));
res = rcs.getResourceMappings().getAssignedResources("numa");
Assert.assertTrue(res.equals(numaRes));
+ Assert.assertTrue(
+ resourceMappings.getAssignedResources("numa").equals(numaRes));
}
private StartContainerRequest storeMockContainer(ContainerId containerId)
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org