You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by zh...@apache.org on 2018/05/09 17:30:59 UTC
[40/50] [abbrv] hadoop git commit: YARN-7137. [YARN-3926] Move newly
added APIs to unstable in YARN-3926 branch. Contributed by Wangda Tan.
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
new file mode 100644
index 0000000..61b8ce5
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
@@ -0,0 +1,254 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableSet;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.Shell;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+public class GpuDiscoverer {
+ public static final Logger LOG = LoggerFactory.getLogger(
+ GpuDiscoverer.class);
+ @VisibleForTesting
+ protected static final String DEFAULT_BINARY_NAME = "nvidia-smi";
+
+ // When executable path not set, try to search default dirs
+ // By default search /usr/bin, /bin, and /usr/local/nvidia/bin (when
+ // launched by nvidia-docker.
+ private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
+ "/usr/bin", "/bin", "/usr/local/nvidia/bin");
+
+ // command should not run more than 10 sec.
+ private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
+ private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
+ private static GpuDiscoverer instance;
+
+ static {
+ instance = new GpuDiscoverer();
+ }
+
+ private Configuration conf = null;
+ private String pathOfGpuBinary = null;
+ private Map<String, String> environment = new HashMap<>();
+ private GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
+
+ private int numOfErrorExecutionSinceLastSucceed = 0;
+ GpuDeviceInformation lastDiscoveredGpuInformation = null;
+
+ private void validateConfOrThrowException() throws YarnException {
+ if (conf == null) {
+ throw new YarnException("Please initialize (call initialize) before use "
+ + GpuDiscoverer.class.getSimpleName());
+ }
+ }
+
+ /**
+ * Get GPU device information from system.
+ * This need to be called after initialize.
+ *
+ * Please note that this only works on *NIX platform, so external caller
+ * need to make sure this.
+ *
+ * @return GpuDeviceInformation
+ * @throws YarnException when any error happens
+ */
+ public synchronized GpuDeviceInformation getGpuDeviceInformation()
+ throws YarnException {
+ validateConfOrThrowException();
+
+ if (null == pathOfGpuBinary) {
+ throw new YarnException(
+ "Failed to find GPU discovery executable, please double check "
+ + YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting.");
+ }
+
+ if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
+ String msg =
+ "Failed to execute GPU device information detection script for "
+ + MAX_REPEATED_ERROR_ALLOWED
+ + " times, skip following executions.";
+ LOG.error(msg);
+ throw new YarnException(msg);
+ }
+
+ String output;
+ try {
+ output = Shell.execCommand(environment,
+ new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS);
+ GpuDeviceInformation info = parser.parseXml(output);
+ numOfErrorExecutionSinceLastSucceed = 0;
+ lastDiscoveredGpuInformation = info;
+ return info;
+ } catch (IOException e) {
+ numOfErrorExecutionSinceLastSucceed++;
+ String msg =
+ "Failed to execute " + pathOfGpuBinary + " exception message:" + e
+ .getMessage() + ", continue ...";
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(msg);
+ }
+ throw new YarnException(e);
+ } catch (YarnException e) {
+ numOfErrorExecutionSinceLastSucceed++;
+ String msg = "Failed to parse xml output" + e.getMessage();
+ if (LOG.isDebugEnabled()) {
+ LOG.warn(msg, e);
+ }
+ throw e;
+ }
+ }
+
+ /**
+ * Get list of minor device numbers of Gpu devices usable by YARN.
+ *
+ * @return List of minor device numbers of Gpu devices.
+ * @throws YarnException when any issue happens
+ */
+ public synchronized List<Integer> getMinorNumbersOfGpusUsableByYarn()
+ throws YarnException {
+ validateConfOrThrowException();
+
+ String allowedDevicesStr = conf.get(
+ YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
+ YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
+
+ List<Integer> minorNumbers = new ArrayList<>();
+
+ if (allowedDevicesStr.equals(
+ YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) {
+ // Get gpu device information from system.
+ if (null == lastDiscoveredGpuInformation) {
+ String msg = YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " is set to "
+ + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES
+ + ", however automatically discovering "
+ + "GPU information failed, please check NodeManager log for more"
+ + " details, as an alternative, admin can specify "
+ + YarnConfiguration.NM_GPU_ALLOWED_DEVICES
+ + " manually to enable GPU isolation.";
+ LOG.error(msg);
+ throw new YarnException(msg);
+ }
+
+ if (lastDiscoveredGpuInformation.getGpus() != null) {
+ for (PerGpuDeviceInformation gpu : lastDiscoveredGpuInformation
+ .getGpus()) {
+ minorNumbers.add(gpu.getMinorNumber());
+ }
+ }
+ } else{
+ for (String s : allowedDevicesStr.split(",")) {
+ if (s.trim().length() > 0) {
+ minorNumbers.add(Integer.valueOf(s.trim()));
+ }
+ }
+ LOG.info("Allowed GPU devices with minor numbers:" + allowedDevicesStr);
+ }
+
+ return minorNumbers;
+ }
+
+ public synchronized void initialize(Configuration conf) throws YarnException {
+ this.conf = conf;
+ numOfErrorExecutionSinceLastSucceed = 0;
+ String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC,
+ YarnConfiguration.DEFAULT_NM_GPU_PATH_TO_EXEC);
+ if (pathToExecutable.isEmpty()) {
+ pathToExecutable = DEFAULT_BINARY_NAME;
+ }
+
+ // Validate file existence
+ File binaryPath = new File(pathToExecutable);
+
+ if (!binaryPath.exists()) {
+ // When binary not exist, use default setting.
+ boolean found = false;
+ for (String dir : DEFAULT_BINARY_SEARCH_DIRS) {
+ binaryPath = new File(dir, DEFAULT_BINARY_NAME);
+ if (binaryPath.exists()) {
+ found = true;
+ pathOfGpuBinary = binaryPath.getAbsolutePath();
+ break;
+ }
+ }
+
+ if (!found) {
+ LOG.warn("Failed to locate binary at:" + binaryPath.getAbsolutePath()
+ + ", please double check [" + YarnConfiguration.NM_GPU_PATH_TO_EXEC
+ + "] setting. Now use " + "default binary:" + DEFAULT_BINARY_NAME);
+ }
+ } else{
+ // If path specified by user is a directory, use
+ if (binaryPath.isDirectory()) {
+ binaryPath = new File(binaryPath, DEFAULT_BINARY_NAME);
+ LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME
+ + " under the directory, updated path-to-executable:" + binaryPath
+ .getAbsolutePath());
+ }
+ // Validated
+ pathOfGpuBinary = binaryPath.getAbsolutePath();
+ }
+
+ // Try to discover GPU information once and print
+ try {
+ LOG.info("Trying to discover GPU information ...");
+ GpuDeviceInformation info = getGpuDeviceInformation();
+ LOG.info(info.toString());
+ } catch (YarnException e) {
+ String msg =
+ "Failed to discover GPU information from system, exception message:"
+ + e.getMessage() + " continue...";
+ LOG.warn(msg);
+ }
+ }
+
+ @VisibleForTesting
+ protected Map<String, String> getEnvironmentToRunCommand() {
+ return environment;
+ }
+
+ @VisibleForTesting
+ protected String getPathOfGpuBinary() {
+ return pathOfGpuBinary;
+ }
+
+ public static GpuDiscoverer getInstance() {
+ return instance;
+ }
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
new file mode 100644
index 0000000..f6bf506
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceInformation;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
+import org.apache.hadoop.yarn.util.resource.ResourceUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.List;
+import java.util.Map;
+
+import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
+
+public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
+ private static final Logger LOG =
+ LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class);
+
+ @Override
+ public void updateConfiguredResource(Resource res) throws YarnException {
+ LOG.info("Initializing configured GPU resources for the NodeManager.");
+
+ List<Integer> usableGpus =
+ GpuDiscoverer.getInstance().getMinorNumbersOfGpusUsableByYarn();
+ if (null == usableGpus || usableGpus.isEmpty()) {
+ LOG.info("Didn't find any usable GPUs on the NodeManager.");
+ // No gpu can be used by YARN.
+ return;
+ }
+
+ long nUsableGpus = usableGpus.size();
+
+ Map<String, ResourceInformation> configuredResourceTypes =
+ ResourceUtils.getResourceTypes();
+ if (!configuredResourceTypes.containsKey(GPU_URI)) {
+ throw new YarnException("Found " + nUsableGpus + " usable GPUs, however "
+ + GPU_URI
+ + " resource-type is not configured inside"
+ + " resource-types.xml, please configure it to enable GPU feature or"
+ + " remove " + GPU_URI + " from "
+ + YarnConfiguration.NM_RESOURCE_PLUGINS);
+ }
+
+ res.setResourceValue(GPU_URI, nUsableGpus);
+ }
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java
new file mode 100644
index 0000000..9576ce7
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceHandlerImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
+
+public class GpuResourcePlugin implements ResourcePlugin {
+ private ResourceHandler gpuResourceHandler = null;
+ private GpuNodeResourceUpdateHandler resourceDiscoverHandler = null;
+
+ @Override
+ public synchronized void initialize(Context context) throws YarnException {
+ resourceDiscoverHandler = new GpuNodeResourceUpdateHandler();
+ GpuDiscoverer.getInstance().initialize(context.getConf());
+ }
+
+ @Override
+ public synchronized ResourceHandler createResourceHandler(
+ Context context, CGroupsHandler cGroupsHandler,
+ PrivilegedOperationExecutor privilegedOperationExecutor) {
+ if (gpuResourceHandler == null) {
+ gpuResourceHandler = new GpuResourceHandlerImpl(context, cGroupsHandler,
+ privilegedOperationExecutor);
+ }
+
+ return gpuResourceHandler;
+ }
+
+ @Override
+ public synchronized NodeResourceUpdaterPlugin getNodeResourceHandlerInstance() {
+ return resourceDiscoverHandler;
+ }
+
+ @Override
+ public void cleanup() throws YarnException {
+ // Do nothing.
+ }
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java
new file mode 100644
index 0000000..977032a
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlRootElement;
+import java.util.List;
+
+/**
+ * All GPU Device Information in the system.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "nvidia_smi_log")
+public class GpuDeviceInformation {
+ List<PerGpuDeviceInformation> gpus;
+
+ String driverVersion = "N/A";
+
+ // More fields like topology information could be added when needed.
+ // ...
+
+ @javax.xml.bind.annotation.XmlElement(name = "gpu")
+ public List<PerGpuDeviceInformation> getGpus() {
+ return gpus;
+ }
+
+ public void setGpus(List<PerGpuDeviceInformation> gpus) {
+ this.gpus = gpus;
+ }
+
+ @javax.xml.bind.annotation.XmlElement(name = "driver_version")
+ public String getDriverVersion() {
+ return driverVersion;
+ }
+
+ public void setDriverVersion(String driverVersion) {
+ this.driverVersion = driverVersion;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("=== Gpus in the system ===\n").append("\tDriver Version:").append(
+ getDriverVersion()).append("\n");
+
+ if (gpus != null) {
+ for (PerGpuDeviceInformation gpu : gpus) {
+ sb.append("\t").append(gpu.toString()).append("\n");
+ }
+ }
+ return sb.toString();
+ }
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
new file mode 100644
index 0000000..1bd92f6
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+import javax.xml.bind.JAXBContext;
+import javax.xml.bind.JAXBException;
+import javax.xml.bind.Unmarshaller;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParserFactory;
+import javax.xml.transform.sax.SAXSource;
+import java.io.StringReader;
+
+/**
+ * Parse XML and get GPU device information
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+public class GpuDeviceInformationParser {
+ private static final Logger LOG = LoggerFactory.getLogger(
+ GpuDeviceInformationParser.class);
+
+ private Unmarshaller unmarshaller = null;
+ private XMLReader xmlReader = null;
+
+ private void init()
+ throws SAXException, ParserConfigurationException, JAXBException {
+ SAXParserFactory spf = SAXParserFactory.newInstance();
+ // Disable external-dtd since by default nvidia-smi output contains
+ // <!DOCTYPE nvidia_smi_log SYSTEM "nvsmi_device_v8.dtd"> in header
+ spf.setFeature(
+ "http://apache.org/xml/features/nonvalidating/load-external-dtd",
+ false);
+ spf.setFeature("http://xml.org/sax/features/validation", false);
+
+ JAXBContext jaxbContext = JAXBContext.newInstance(
+ GpuDeviceInformation.class);
+
+ this.xmlReader = spf.newSAXParser().getXMLReader();
+ this.unmarshaller = jaxbContext.createUnmarshaller();
+ }
+
+ public synchronized GpuDeviceInformation parseXml(String xmlContent)
+ throws YarnException {
+ if (unmarshaller == null) {
+ try {
+ init();
+ } catch (SAXException | ParserConfigurationException | JAXBException e) {
+ LOG.error("Exception while initialize parser", e);
+ throw new YarnException(e);
+ }
+ }
+
+ InputSource inputSource = new InputSource(new StringReader(xmlContent));
+ SAXSource source = new SAXSource(xmlReader, inputSource);
+ try {
+ return (GpuDeviceInformation) unmarshaller.unmarshal(source);
+ } catch (JAXBException e) {
+ LOG.error("Exception while parsing xml", e);
+ throw new YarnException(e);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java
new file mode 100644
index 0000000..f315313
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java
@@ -0,0 +1,165 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlAdapter;
+
/**
 * Capture single GPU device information such as memory size, temperature,
 * utilization.
 *
 * JAXB-bound to one {@code <gpu>} element of nvidia-smi's XML output; the
 * element-to-property mapping is declared on the getters below, so accessor
 * names and annotation placement are part of the wire contract.
 */
@InterfaceAudience.Private
@InterfaceStability.Unstable
@XmlRootElement(name = "gpu")
public class PerGpuDeviceInformation {

  // Defaults ("N/A" / -1) are kept when the corresponding XML element is
  // absent from the nvidia-smi output.
  private String productName = "N/A";
  private String uuid = "N/A";
  private int minorNumber = -1;

  private PerGpuUtilizations gpuUtilizations;
  private PerGpuMemoryUsage gpuMemoryUsage;
  private PerGpuTemperature temperature;

  /**
   * Convert formats like "34 C", "75.6 %" to float.
   */
  @InterfaceAudience.Private
  @InterfaceStability.Unstable
  static class StrToFloatBeforeSpaceAdapter extends
      XmlAdapter<String, Float> {
    @Override
    public String marshal(Float v) throws Exception {
      // Note: marshalling drops the unit suffix; round-tripping is not
      // symmetric with unmarshal, which expects "<value> <unit>".
      if (v == null) {
        return "";
      }
      return String.valueOf(v);
    }

    @Override
    public Float unmarshal(String v) throws Exception {
      if (v == null) {
        return -1f;
      }

      // NOTE(review): an empty or non-numeric value (e.g. "") would make
      // Float.valueOf throw NumberFormatException here — confirm nvidia-smi
      // never emits an empty element for these fields.
      return Float.valueOf(v.split(" ")[0]);
    }
  }

  /**
   * Convert formats like "725 MiB" to long.
   */
  @InterfaceAudience.Private
  @InterfaceStability.Unstable
  static class StrToMemAdapter extends XmlAdapter<String, Long> {
    @Override
    public String marshal(Long v) throws Exception {
      if (v == null) {
        return "";
      }
      // Marshalling re-attaches the MiB unit expected by the XML format.
      return String.valueOf(v) + " MiB";
    }

    @Override
    public Long unmarshal(String v) throws Exception {
      if (v == null) {
        return -1L;
      }
      // Take the numeric part before the unit, e.g. "725 MiB" -> 725.
      return Long.valueOf(v.split(" ")[0]);
    }
  }

  @XmlElement(name = "temperature")
  public PerGpuTemperature getTemperature() {
    return temperature;
  }

  public void setTemperature(PerGpuTemperature temperature) {
    this.temperature = temperature;
  }

  @XmlElement(name = "uuid")
  public String getUuid() {
    return uuid;
  }

  public void setUuid(String uuid) {
    this.uuid = uuid;
  }

  @XmlElement(name = "product_name")
  public String getProductName() {
    return productName;
  }

  public void setProductName(String productName) {
    this.productName = productName;
  }

  @XmlElement(name = "minor_number")
  public int getMinorNumber() {
    return minorNumber;
  }

  public void setMinorNumber(int minorNumber) {
    this.minorNumber = minorNumber;
  }

  @XmlElement(name = "utilization")
  public PerGpuUtilizations getGpuUtilizations() {
    return gpuUtilizations;
  }

  public void setGpuUtilizations(PerGpuUtilizations utilizations) {
    this.gpuUtilizations = utilizations;
  }

  @XmlElement(name = "bar1_memory_usage")
  public PerGpuMemoryUsage getGpuMemoryUsage() {
    return gpuMemoryUsage;
  }

  public void setGpuMemoryUsage(PerGpuMemoryUsage gpuMemoryUsage) {
    this.gpuMemoryUsage = gpuMemoryUsage;
  }


  /**
   * Human-readable one-line summary; memory and utilization parts are only
   * included when the corresponding sub-elements were present in the XML.
   */
  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append("ProductName=").append(productName).append(", MinorNumber=")
        .append(minorNumber);

    if (getGpuMemoryUsage() != null) {
      sb.append(", TotalMemory=").append(
          getGpuMemoryUsage().getTotalMemoryMiB()).append("MiB");
    }

    if (getGpuUtilizations() != null) {
      sb.append(", Utilization=").append(
          getGpuUtilizations().getOverallGpuUtilization()).append("%");
    }
    return sb.toString();
  }
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java
new file mode 100644
index 0000000..3964c4e
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
+
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "bar1_memory_usage")
+public class PerGpuMemoryUsage {
+ long usedMemoryMiB = -1L;
+ long availMemoryMiB = -1L;
+
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToMemAdapter.class)
+ @XmlElement(name = "used")
+ public Long getUsedMemoryMiB() {
+ return usedMemoryMiB;
+ }
+
+ public void setUsedMemoryMiB(Long usedMemoryMiB) {
+ this.usedMemoryMiB = usedMemoryMiB;
+ }
+
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToMemAdapter.class)
+ @XmlElement(name = "free")
+ public Long getAvailMemoryMiB() {
+ return availMemoryMiB;
+ }
+
+ public void setAvailMemoryMiB(Long availMemoryMiB) {
+ this.availMemoryMiB = availMemoryMiB;
+ }
+
+ public long getTotalMemoryMiB() {
+ return usedMemoryMiB + availMemoryMiB;
+ }
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java
new file mode 100644
index 0000000..ccd60cb
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
+
+/**
+ * Temperature of GPU
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "temperature")
+public class PerGpuTemperature {
+ private float currentGpuTemp = Float.MIN_VALUE;
+ private float maxGpuTemp = Float.MIN_VALUE;
+ private float slowThresholdGpuTemp = Float.MIN_VALUE;
+
+ /**
+ * Get current celsius GPU temperature
+ * @return temperature
+ */
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+ @XmlElement(name = "gpu_temp")
+ public Float getCurrentGpuTemp() {
+ return currentGpuTemp;
+ }
+
+ public void setCurrentGpuTemp(Float currentGpuTemp) {
+ this.currentGpuTemp = currentGpuTemp;
+ }
+
+ /**
+ * Get max possible celsius GPU temperature
+ * @return temperature
+ */
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+ @XmlElement(name = "gpu_temp_max_threshold")
+ public Float getMaxGpuTemp() {
+ return maxGpuTemp;
+ }
+
+ public void setMaxGpuTemp(Float maxGpuTemp) {
+ this.maxGpuTemp = maxGpuTemp;
+ }
+
+ /**
+ * Get celsius GPU temperature which could make GPU runs slower
+ * @return temperature
+ */
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+ @XmlElement(name = "gpu_temp_slow_threshold")
+ public Float getSlowThresholdGpuTemp() {
+ return slowThresholdGpuTemp;
+ }
+
+ public void setSlowThresholdGpuTemp(Float slowThresholdGpuTemp) {
+ this.slowThresholdGpuTemp = slowThresholdGpuTemp;
+ }
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java
new file mode 100644
index 0000000..4ef218b
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
+
+/**
+ * GPU utilizations
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "utilization")
+public class PerGpuUtilizations {
+ private float overallGpuUtilization;
+
+ /**
+ * Overall percent GPU utilization
+ * @return utilization
+ */
+ @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class)
+ @XmlElement(name = "gpu_util")
+ public Float getOverallGpuUtilization() {
+ return overallGpuUtilization;
+ }
+
+ public void setOverallGpuUtilization(Float overallGpuUtilization) {
+ this.overallGpuUtilization = overallGpuUtilization;
+ }
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java
new file mode 100644
index 0000000..13b3ee9
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java
@@ -0,0 +1,164 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.net.ServerSocketUtil;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.event.Dispatcher;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.factories.RecordFactory;
+import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
+import org.apache.hadoop.yarn.server.api.ResourceTracker;
+import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest;
+import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
+import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
+import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse;
+import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerRequest;
+import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerResponse;
+import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.NodeHeartbeatResponsePBImpl;
+import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.RegisterNodeManagerResponsePBImpl;
+import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.UnRegisterNodeManagerResponsePBImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
+import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
+import org.junit.Assert;
+import org.junit.Before;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+
+public class NodeManagerTestBase {
+ // temp fix until metrics system can auto-detect itself running in unit test:
+ static {
+ DefaultMetricsSystem.setMiniClusterMode(true);
+ }
+
+ protected static final Logger LOG =
+ LoggerFactory.getLogger(TestNodeStatusUpdater.class);
+ protected static final File basedir =
+ new File("target", TestNodeStatusUpdater.class.getName());
+ protected static final File nmLocalDir = new File(basedir, "nm0");
+ protected static final File tmpDir = new File(basedir, "tmpDir");
+ protected static final File remoteLogsDir = new File(basedir, "remotelogs");
+ protected static final File logsDir = new File(basedir, "logs");
+ protected static final RecordFactory recordFactory = RecordFactoryProvider
+ .getRecordFactory(null);
+ protected Configuration conf;
+
+ protected YarnConfiguration createNMConfig() throws IOException {
+ return createNMConfig(ServerSocketUtil.getPort(49170, 10));
+ }
+
+ protected YarnConfiguration createNMConfig(int port) throws IOException {
+ YarnConfiguration conf = new YarnConfiguration();
+ String localhostAddress = null;
+ try {
+ localhostAddress = InetAddress.getByName("localhost")
+ .getCanonicalHostName();
+ } catch (UnknownHostException e) {
+ Assert.fail("Unable to get localhost address: " + e.getMessage());
+ }
+ conf.setInt(YarnConfiguration.NM_PMEM_MB, 5 * 1024); // 5GB
+ conf.set(YarnConfiguration.NM_ADDRESS, localhostAddress + ":" + port);
+ conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, localhostAddress + ":"
+ + ServerSocketUtil.getPort(49160, 10));
+ conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath());
+ conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
+ remoteLogsDir.getAbsolutePath());
+ conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
+ conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
+ return conf;
+ }
+
+ public static class BaseResourceTrackerForTest implements ResourceTracker {
+ @Override
+ public RegisterNodeManagerResponse registerNodeManager(
+ RegisterNodeManagerRequest request) throws YarnException, IOException {
+ return new RegisterNodeManagerResponsePBImpl();
+ }
+
+ @Override
+ public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request)
+ throws YarnException, IOException {
+ return new NodeHeartbeatResponsePBImpl();
+ }
+
+ @Override
+ public UnRegisterNodeManagerResponse unRegisterNodeManager(
+ UnRegisterNodeManagerRequest request)
+ throws YarnException, IOException {
+ return new UnRegisterNodeManagerResponsePBImpl();
+ }
+ }
+
+ protected static class BaseNodeStatusUpdaterForTest extends NodeStatusUpdaterImpl {
+ public ResourceTracker resourceTracker;
+ protected Context context;
+
+ public BaseNodeStatusUpdaterForTest(Context context, Dispatcher dispatcher,
+ NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
+ ResourceTracker resourceTracker) {
+ super(context, dispatcher, healthChecker, metrics);
+ this.context = context;
+ this.resourceTracker = resourceTracker;
+ }
+ @Override
+ protected ResourceTracker getRMClient() {
+ return resourceTracker;
+ }
+
+ @Override
+ protected void stopRMProxy() {
+ return;
+ }
+ }
+
+ public class MyContainerManager extends ContainerManagerImpl {
+ public boolean signaled = false;
+
+ public MyContainerManager(Context context, ContainerExecutor exec,
+ DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater,
+ NodeManagerMetrics metrics,
+ LocalDirsHandlerService dirsHandler) {
+ super(context, exec, deletionContext, nodeStatusUpdater,
+ metrics, dirsHandler);
+ }
+
+ @Override
+ public void handle(ContainerManagerEvent event) {
+ if (event.getType() == ContainerManagerEventType.SIGNAL_CONTAINERS) {
+ signaled = true;
+ }
+ }
+ }
+
+ @Before
+ public void setUp() throws IOException {
+ nmLocalDir.mkdirs();
+ tmpDir.mkdirs();
+ logsDir.mkdirs();
+ remoteLogsDir.mkdirs();
+ conf = createNMConfig();
+ }
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java
index 2e9eff5..9b180c7 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java
@@ -178,7 +178,7 @@ public class TestDefaultContainerExecutor {
FileContext lfs = FileContext.getLocalFSFileContext(conf);
DefaultContainerExecutor executor = new DefaultContainerExecutor(lfs);
executor.setConf(conf);
- executor.init();
+ executor.init(null);
try {
executor.createUserLocalDirs(localDirs, user);
@@ -317,7 +317,7 @@ public class TestDefaultContainerExecutor {
Path workDir = localDir;
Path pidFile = new Path(workDir, "pid.txt");
- mockExec.init();
+ mockExec.init(null);
mockExec.activateContainer(cId, pidFile);
int ret = mockExec.launchContainer(new ContainerStartContext.Builder()
.setContainer(container)
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
index cf8d977..95c8f5e 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
@@ -628,7 +628,7 @@ public class TestLinuxContainerExecutor {
LinuxContainerExecutor lce = new LinuxContainerExecutor();
lce.setConf(conf);
try {
- lce.init();
+ lce.init(null);
} catch (IOException e) {
// expected if LCE isn't setup right, but not necessary for this test
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
index 79b88cf..249e017 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java
@@ -426,7 +426,7 @@ public class TestLinuxContainerExecutorWithMocks {
@Test
public void testInit() throws Exception {
- mockExec.init();
+ mockExec.init(mock(Context.class));
assertEquals(Arrays.asList("--checksetup"), readMockParams());
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java
index 9279711..b31215b 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java
@@ -37,7 +37,7 @@ public class TestNodeManager {
public static final class InvalidContainerExecutor extends
DefaultContainerExecutor {
@Override
- public void init() throws IOException {
+ public void init(Context nmContext) throws IOException {
throw new IOException("dummy executor init called");
}
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
index 055dab4..533cf2a 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
@@ -20,16 +20,14 @@ package org.apache.hadoop.yarn.server.nodemanager;
import static org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils.newNodeHeartbeatResponse;
import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
-import java.net.UnknownHostException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
@@ -80,8 +78,6 @@ import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
-import org.apache.hadoop.yarn.factories.RecordFactory;
-import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.NodeHeartbeatResponseProto;
import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
import org.apache.hadoop.yarn.server.api.ResourceTracker;
@@ -117,41 +113,14 @@ import org.junit.Before;
import org.junit.Test;
@SuppressWarnings("rawtypes")
-public class TestNodeStatusUpdater {
-
- // temp fix until metrics system can auto-detect itself running in unit test:
- static {
- DefaultMetricsSystem.setMiniClusterMode(true);
- }
-
- static final Logger LOG =
- LoggerFactory.getLogger(TestNodeStatusUpdater.class);
- static final File basedir =
- new File("target", TestNodeStatusUpdater.class.getName());
- static final File nmLocalDir = new File(basedir, "nm0");
- static final File tmpDir = new File(basedir, "tmpDir");
- static final File remoteLogsDir = new File(basedir, "remotelogs");
- static final File logsDir = new File(basedir, "logs");
- private static final RecordFactory recordFactory = RecordFactoryProvider
- .getRecordFactory(null);
-
+public class TestNodeStatusUpdater extends NodeManagerTestBase {
volatile int heartBeatID = 0;
volatile Throwable nmStartError = null;
private final List<NodeId> registeredNodes = new ArrayList<NodeId>();
private boolean triggered = false;
- private Configuration conf;
private NodeManager nm;
private AtomicBoolean assertionFailedInThread = new AtomicBoolean(false);
- @Before
- public void setUp() throws IOException {
- nmLocalDir.mkdirs();
- tmpDir.mkdirs();
- logsDir.mkdirs();
- remoteLogsDir.mkdirs();
- conf = createNMConfig();
- }
-
@After
public void tearDown() {
this.registeredNodes.clear();
@@ -332,29 +301,7 @@ public class TestNodeStatusUpdater {
}
}
- private class MyContainerManager extends ContainerManagerImpl {
- public boolean signaled = false;
-
- public MyContainerManager(Context context, ContainerExecutor exec,
- DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater,
- NodeManagerMetrics metrics,
- LocalDirsHandlerService dirsHandler) {
- super(context, exec, deletionContext, nodeStatusUpdater,
- metrics, dirsHandler);
- }
-
- @Override
- public void handle(ContainerManagerEvent event) {
- if (event.getType() == ContainerManagerEventType.SIGNAL_CONTAINERS) {
- signaled = true;
- }
- }
- }
-
- private class MyNodeStatusUpdater extends NodeStatusUpdaterImpl {
- public ResourceTracker resourceTracker;
- private Context context;
-
+ private class MyNodeStatusUpdater extends BaseNodeStatusUpdaterForTest {
public MyNodeStatusUpdater(Context context, Dispatcher dispatcher,
NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) {
this(context, dispatcher, healthChecker, metrics, false);
@@ -363,19 +310,8 @@ public class TestNodeStatusUpdater {
public MyNodeStatusUpdater(Context context, Dispatcher dispatcher,
NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
boolean signalContainer) {
- super(context, dispatcher, healthChecker, metrics);
- this.context = context;
- resourceTracker = new MyResourceTracker(this.context, signalContainer);
- }
-
- @Override
- protected ResourceTracker getRMClient() {
- return resourceTracker;
- }
-
- @Override
- protected void stopRMProxy() {
- return;
+ super(context, dispatcher, healthChecker, metrics,
+ new MyResourceTracker(context, signalContainer));
}
}
@@ -1818,7 +1754,6 @@ public class TestNodeStatusUpdater {
Assert.assertTrue("Test failed with exception(s)" + exceptions,
exceptions.isEmpty());
}
-
// Add new containers info into NM context each time node heart beats.
private class MyNMContext extends NMContext {
@@ -1922,31 +1857,6 @@ public class TestNodeStatusUpdater {
this.registeredNodes.size());
}
- private YarnConfiguration createNMConfig(int port) throws IOException {
- YarnConfiguration conf = new YarnConfiguration();
- String localhostAddress = null;
- try {
- localhostAddress = InetAddress.getByName("localhost")
- .getCanonicalHostName();
- } catch (UnknownHostException e) {
- Assert.fail("Unable to get localhost address: " + e.getMessage());
- }
- conf.setInt(YarnConfiguration.NM_PMEM_MB, 5 * 1024); // 5GB
- conf.set(YarnConfiguration.NM_ADDRESS, localhostAddress + ":" + port);
- conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, localhostAddress + ":"
- + ServerSocketUtil.getPort(49160, 10));
- conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath());
- conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
- remoteLogsDir.getAbsolutePath());
- conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
- conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
- return conf;
- }
-
- private YarnConfiguration createNMConfig() throws IOException {
- return createNMConfig(ServerSocketUtil.getPort(49170, 10));
- }
-
private NodeManager getNodeManager(final NodeAction nodeHeartBeatAction) {
return new NodeManager() {
@Override
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java
index 0958191..c9c0a38 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java
@@ -18,26 +18,6 @@
package org.apache.hadoop.yarn.server.nodemanager.amrmproxy;
-import java.io.IOException;
-import java.security.PrivilegedExceptionAction;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ConcurrentLinkedQueue;
-import java.util.concurrent.ConcurrentMap;
-import java.util.concurrent.ExecutorCompletionService;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.hadoop.yarn.server.nodemanager.ContainerStateTransitionListener;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
@@ -65,6 +45,7 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport;
import org.apache.hadoop.yarn.server.api.records.AppCollectorData;
import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.ContainerStateTransitionListener;
import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
@@ -73,18 +54,37 @@ import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMMemoryStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredAMRMProxyState;
-import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM;
import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher;
+import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.util.Records;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.security.PrivilegedExceptionAction;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
/**
* Base class for all the AMRMProxyService test cases. It provides utility
@@ -803,5 +803,9 @@ public abstract class BaseAMRMProxyTest {
public NMTimelinePublisher getNMTimelinePublisher() {
return null;
}
+
+ // Stub for the Context API introduced with resource plugins;
+ // AMRMProxy tests do not exercise plugins, so null is sufficient.
+ public ResourcePluginManager getResourcePluginManager() {
+ return null;
+ }
}
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java
index e5414a5..0563694 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java
@@ -22,6 +22,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resourc
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
@@ -30,6 +31,8 @@ import org.slf4j.LoggerFactory;
import java.util.List;
+import static org.mockito.Mockito.mock;
+
public class TestResourceHandlerModule {
private static final Logger LOG =
LoggerFactory.getLogger(TestResourceHandlerModule.class);
@@ -62,7 +65,7 @@ public class TestResourceHandlerModule {
//Ensure that outbound bandwidth resource handler is present in the chain
ResourceHandlerChain resourceHandlerChain = ResourceHandlerModule
- .getConfiguredResourceHandlerChain(networkEnabledConf);
+ .getConfiguredResourceHandlerChain(networkEnabledConf, mock(Context.class));
List<ResourceHandler> resourceHandlers = resourceHandlerChain
.getResourceHandlerList();
//Exactly one resource handler in chain
@@ -88,7 +91,8 @@ public class TestResourceHandlerModule {
Assert.assertNotNull(handler);
ResourceHandlerChain resourceHandlerChain =
- ResourceHandlerModule.getConfiguredResourceHandlerChain(diskConf);
+ ResourceHandlerModule.getConfiguredResourceHandlerChain(diskConf,
+ mock(Context.class));
List<ResourceHandler> resourceHandlers =
resourceHandlerChain.getResourceHandlerList();
// Exactly one resource handler in chain
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
new file mode 100644
index 0000000..5c70f7a
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
@@ -0,0 +1,382 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.yarn.api.protocolrecords.ResourceTypes;
+import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
+import org.apache.hadoop.yarn.api.records.ApplicationId;
+import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceInformation;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
+import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
+import org.apache.hadoop.yarn.util.resource.ResourceUtils;
+import org.apache.hadoop.yarn.util.resource.TestResourceUtils;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.anyList;
+import static org.mockito.Matchers.anyString;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.doThrow;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+/**
+ * Unit tests for {@code GpuResourceHandlerImpl}: GPU allocation and
+ * release, cgroups "devices" controller isolation, persistence of
+ * assigned GPUs into the NM state store, and recovery of allocations
+ * when containers are reacquired after an NM restart. All NM
+ * collaborators are Mockito mocks, so no real GPUs or cgroups are
+ * touched; GPU discovery is driven by NM_GPU_ALLOWED_DEVICES.
+ */
+public class TestGpuResourceHandler {
+ private CGroupsHandler mockCGroupsHandler;
+ private PrivilegedOperationExecutor mockPrivilegedExecutor;
+ private GpuResourceHandlerImpl gpuResourceHandler;
+ private NMStateStoreService mockNMStateStore;
+ // Backs Context#getContainers(); recovery tests register containers
+ // here to simulate what the NM tracks as running.
+ private ConcurrentHashMap<ContainerId, Container> runningContainersMap;
+
+ /** Registers the GPU resource type and wires the handler to mocks. */
+ @Before
+ public void setup() {
+ TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI);
+
+ mockCGroupsHandler = mock(CGroupsHandler.class);
+ mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class);
+ mockNMStateStore = mock(NMStateStoreService.class);
+
+ Context nmctx = mock(Context.class);
+ when(nmctx.getNMStateStore()).thenReturn(mockNMStateStore);
+ runningContainersMap = new ConcurrentHashMap<>();
+ when(nmctx.getContainers()).thenReturn(runningContainersMap);
+
+ gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler,
+ mockPrivilegedExecutor);
+ }
+
+ /** bootstrap() must initialize the cgroups "devices" controller. */
+ @Test
+ public void testBootStrap() throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0");
+
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ verify(mockCGroupsHandler, times(1)).initializeCGroupController(
+ CGroupsHandler.CGroupController.DEVICES);
+ }
+
+ /** Builds a deterministic ContainerId (app 1234, attempt 1) for id. */
+ private static ContainerId getContainerId(int id) {
+ return ContainerId.newContainerId(ApplicationAttemptId
+ .newInstance(ApplicationId.newInstance(1234L, 1), 1), id);
+ }
+
+ /**
+ * Mocks a Container whose Resource requests {@code numGpuRequest} GPUs
+ * and which carries a fresh, mutable ResourceMappings so preStart can
+ * record assigned devices on it.
+ */
+ private static Container mockContainerWithGpuRequest(int id,
+ int numGpuRequest) {
+ Container c = mock(Container.class);
+ when(c.getContainerId()).thenReturn(getContainerId(id));
+
+ Resource res = Resource.newInstance(1024, 1);
+ ResourceMappings resMapping = new ResourceMappings();
+
+ res.setResourceValue(ResourceInformation.GPU_URI, numGpuRequest);
+ when(c.getResource()).thenReturn(res);
+ when(c.getResourceMappings()).thenReturn(resMapping);
+ return c;
+ }
+
+ /**
+ * Asserts a per-container devices cgroup was created and, when
+ * deniedDevices is non-empty, that exactly one GPU privileged operation
+ * excluding those device minor numbers was executed. The verify relies
+ * on PrivilegedOperation value equality to match the expected op.
+ */
+ private void verifyDeniedDevices(ContainerId containerId,
+ List<Integer> deniedDevices)
+ throws ResourceHandlerException, PrivilegedOperationException {
+ verify(mockCGroupsHandler, times(1)).createCGroup(
+ CGroupsHandler.CGroupController.DEVICES, containerId.toString());
+
+ if (null != deniedDevices && !deniedDevices.isEmpty()) {
+ verify(mockPrivilegedExecutor, times(1)).executePrivilegedOperation(
+ new PrivilegedOperation(PrivilegedOperation.OperationType.GPU, Arrays
+ .asList(GpuResourceHandlerImpl.CONTAINER_ID_CLI_OPTION,
+ containerId.toString(),
+ GpuResourceHandlerImpl.EXCLUDED_GPUS_CLI_OPTION,
+ StringUtils.join(",", deniedDevices))), true);
+ }
+ }
+
+ /** Full allocate / deny / over-allocate / release cycle over 4 GPUs. */
+ @Test
+ public void testAllocation() throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ /* Start container 1, asking for 3 GPUs */
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
+
+ // Only device=4 will be blocked.
+ verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
+
+ /* Start container 2, asking for 2 GPUs; only 1 remains, expected to fail */
+ boolean failedToAllocate = false;
+ try {
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 2));
+ } catch (ResourceHandlerException e) {
+ failedToAllocate = true;
+ }
+ Assert.assertTrue(failedToAllocate);
+
+ /* Start container 3, asking for 1 GPU; succeeds */
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(3, 1));
+
+ // devices = 0/1/3 will be blocked
+ verifyDeniedDevices(getContainerId(3), Arrays.asList(0, 1, 3));
+
+ /* Start container 4, asking for 0 GPUs; succeeds */
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(4, 0));
+
+ // All devices will be blocked
+ verifyDeniedDevices(getContainerId(4), Arrays.asList(0, 1, 3, 4));
+
+ /* Release container-1; its GPUs return to the pool */
+ gpuResourceHandler.postComplete(getContainerId(1));
+
+ // NOTE(review): the verify below re-checks createCGroup; a
+ // deleteCGroup verification may have been intended — confirm.
+ verify(mockCGroupsHandler, times(1)).createCGroup(
+ CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
+ Assert.assertEquals(3,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ /* Release container-3; its GPU returns to the pool */
+ gpuResourceHandler.postComplete(getContainerId(3));
+
+ // NOTE(review): same as above — createCGroup vs deleteCGroup.
+ verify(mockCGroupsHandler, times(1)).createCGroup(
+ CGroupsHandler.CGroupController.DEVICES, getContainerId(3).toString());
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+ }
+
+ /**
+ * If persisting the assigned GPUs to the NM state store fails,
+ * preStart must throw and roll the allocation back so all GPUs
+ * remain available.
+ */
+ @SuppressWarnings("unchecked")
+ @Test
+ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
+ throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ doThrow(new IOException("Exception ...")).when(mockNMStateStore)
+ .storeAssignedResources(
+ any(ContainerId.class), anyString(), anyList());
+
+ boolean exception = false;
+ /* Start container 1, asking for 3 GPUs */
+ try {
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
+ } catch (ResourceHandlerException e) {
+ exception = true;
+ }
+
+ Assert.assertTrue("preStart should throw exception", exception);
+
+ // After preStart, we still have 4 available GPU since the store op fails.
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+ }
+
+ /**
+ * With an empty allowed-device list, a zero-GPU container still starts
+ * (every device denied in its cgroup) while any positive GPU request
+ * must fail.
+ */
+ @Test
+ public void testAllocationWithoutAllowedGpus() throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ Assert.assertEquals(0,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ /* Start container 1, asking for 0 GPUs */
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 0));
+ verifyDeniedDevices(getContainerId(1), Collections.emptyList());
+
+ /* Start container 2, asking for 1 GPU; none exist, expected to fail */
+ boolean failedToAllocate = false;
+ try {
+ gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 1));
+ } catch (ResourceHandlerException e) {
+ failedToAllocate = true;
+ }
+ Assert.assertTrue(failedToAllocate);
+
+ /* Release container 1 */
+ gpuResourceHandler.postComplete(getContainerId(1));
+
+ // NOTE(review): createCGroup is verified after release here too;
+ // deleteCGroup may have been intended — confirm.
+ verify(mockCGroupsHandler, times(1)).createCGroup(
+ CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
+ Assert.assertEquals(0,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+ }
+
+ /**
+ * Assigned GPUs are persisted to the state store and recorded in the
+ * container's ResourceMappings; a zero-GPU container triggers no
+ * store operation.
+ */
+ @Test
+ public void testAllocationStored() throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ /* Start container 1, asking for 3 GPUs */
+ Container container = mockContainerWithGpuRequest(1, 3);
+ gpuResourceHandler.preStart(container);
+
+ verify(mockNMStateStore).storeAssignedResources(getContainerId(1),
+ ResourceInformation.GPU_URI,
+ Arrays.asList("0", "1", "3"));
+
+ Assert.assertEquals(3, container.getResourceMappings()
+ .getAssignedResources(ResourceInformation.GPU_URI).size());
+
+ // Only device=4 will be blocked.
+ verifyDeniedDevices(getContainerId(1), Arrays.asList(4));
+
+ /* Start container 2, asking for 0 GPUs; succeeds */
+ container = mockContainerWithGpuRequest(2, 0);
+ gpuResourceHandler.preStart(container);
+
+ verifyDeniedDevices(getContainerId(2), Arrays.asList(0, 1, 3, 4));
+ Assert.assertEquals(0, container.getResourceMappings()
+ .getAssignedResources(ResourceInformation.GPU_URI).size());
+
+ // Store assigned resource will not be invoked.
+ verify(mockNMStateStore, never()).storeAssignedResources(
+ eq(getContainerId(2)), eq(ResourceInformation.GPU_URI), anyList());
+ }
+
+ /**
+ * reacquireContainer must restore the allocator's device map from a
+ * container's persisted ResourceMappings, and must reject (without
+ * mutating state) mappings that reference devices outside the allowed
+ * list or devices that are already assigned.
+ */
+ @Test
+ public void testRecoverResourceAllocation() throws Exception {
+ Configuration conf = new YarnConfiguration();
+ conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4");
+ GpuDiscoverer.getInstance().initialize(conf);
+
+ gpuResourceHandler.bootstrap(conf);
+ Assert.assertEquals(4,
+ gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+ Container nmContainer = mock(Container.class);
+ ResourceMappings rmap = new ResourceMappings();
+ ResourceMappings.AssignedResources ar =
+ new ResourceMappings.AssignedResources();
+ ar.updateAssignedResources(Arrays.asList("1", "3"));
+ rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
+ when(nmContainer.getResourceMappings()).thenReturn(rmap);
+
+ runningContainersMap.put(getContainerId(1), nmContainer);
+
+ // TEST CASE
+ // Reacquire container restore state of GPU Resource Allocator.
+ gpuResourceHandler.reacquireContainer(getContainerId(1));
+
+ Map<Integer, ContainerId> deviceAllocationMapping =
+ gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
+ Assert.assertEquals(2, deviceAllocationMapping.size());
+ Assert.assertTrue(
+ deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
+ Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
+
+ // TEST CASE
+ // Try to reacquire a container but requested device is not in allowed list.
+ nmContainer = mock(Container.class);
+ rmap = new ResourceMappings();
+ ar = new ResourceMappings.AssignedResources();
+ // id=5 is not in allowed list.
+ ar.updateAssignedResources(Arrays.asList("4", "5"));
+ rmap.addAssignedResources(ResourceInformation.GPU_URI, ar);
+ when(nmContainer.getResourceMappings()).thenReturn(rmap);
+
+ runningContainersMap.put(getContainerId(2), nmContainer);
+
+ boolean caughtException = false;
+ // NOTE(review): container 2 holds the bad mapping, but reacquire is
+ // invoked with container 1; getContainerId(2) was likely intended.
+ try {
+ gpuResourceHandler.reacquireContainer(getContainerId(1));
+ } catch (ResourceHandlerException e) {
+ caughtException = true;
+ }
+ Assert.assertTrue(
+ "Should fail since requested device Id is not in allowed list",
+ caughtException);
+
+ // Make sure internal state not changed.
+ deviceAllocationMapping =
+ gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
+ Assert.assertEquals(2, deviceAllocationMapping.size());
+ Assert.assertTrue(
+ deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
+ Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
+
+ // TEST CASE
+ // Try to reacquire a container but requested device is already assigned.
+ nmContainer = mock(Container.class);
+ rmap = new ResourceMappings();
+ ar = new ResourceMappings.AssignedResources();
+ // id=3 is already assigned
+ ar.updateAssignedResources(Arrays.asList("4", "3"));
+ // NOTE(review): literal "gpu" key; elsewhere this class uses
+ // ResourceInformation.GPU_URI — the two must match for the mapping
+ // to be found during recovery. Verify intentional.
+ rmap.addAssignedResources("gpu", ar);
+ when(nmContainer.getResourceMappings()).thenReturn(rmap);
+
+ runningContainersMap.put(getContainerId(2), nmContainer);
+
+ caughtException = false;
+ // NOTE(review): again reacquires container 1 although container 2
+ // holds the conflicting mapping — getContainerId(2) likely intended;
+ // the assertion message below is also copied from the previous case.
+ try {
+ gpuResourceHandler.reacquireContainer(getContainerId(1));
+ } catch (ResourceHandlerException e) {
+ caughtException = true;
+ }
+ Assert.assertTrue(
+ "Should fail since requested device Id is not in allowed list",
+ caughtException);
+
+ // Make sure internal state not changed.
+ deviceAllocationMapping =
+ gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping();
+ Assert.assertEquals(2, deviceAllocationMapping.size());
+ Assert.assertTrue(
+ deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3)));
+ Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1));
+ }
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
index 318ae6b..a147afb 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
@@ -70,7 +70,7 @@ public class TestContainersMonitorResourceChange {
private static class MockExecutor extends ContainerExecutor {
@Override
- public void init() throws IOException {
+ public void init(Context nmContext) throws IOException {
}
@Override
public void startLocalizer(LocalizerStartContext ctx)
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org