You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by su...@apache.org on 2017/02/08 10:34:03 UTC
hadoop git commit: YARN-6031. Application recovery has failed when
node label feature is turned off during RM recovery. Contributed by Ying
Zhang.
Repository: hadoop
Updated Branches:
refs/heads/branch-2.8 2bbcaa8ad -> 959db3866
YARN-6031. Application recovery has failed when node label feature is turned off during RM recovery. Contributed by Ying Zhang.
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/959db386
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/959db386
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/959db386
Branch: refs/heads/branch-2.8
Commit: 959db386626aea63f9fa0cedcb2c06887283b71f
Parents: 2bbcaa8
Author: Sunil G <su...@apache.org>
Authored: Wed Feb 8 16:03:45 2017 +0530
Committer: Sunil G <su...@apache.org>
Committed: Wed Feb 8 16:03:45 2017 +0530
----------------------------------------------------------------------
.../server/resourcemanager/RMAppManager.java | 54 +++++++++++++-
.../server/resourcemanager/TestRMRestart.java | 76 ++++++++++++++++++++
2 files changed, 128 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hadoop/blob/959db386/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java
index b2f06ab..ff0bebb 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java
@@ -37,6 +37,7 @@ import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException;
+import org.apache.hadoop.yarn.exceptions.InvalidLabelResourceRequestException;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.ipc.RPCUtil;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
@@ -55,6 +56,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
@@ -322,6 +324,34 @@ public class RMAppManager implements EventHandler<RMAppManagerEvent>,
createAndPopulateNewRMApp(appContext, appState.getSubmitTime(),
appState.getUser(), true);
+ // If the AM resource request is null, check whether the application
+ // specified a node label expression while node labels are disabled.
+ // If so, reject the recovery of this application with a clear
+ // message so that the user can react properly.
+ if (!appContext.getUnmanagedAM() &&
+ application.getAMResourceRequest() == null &&
+ !YarnConfiguration.areNodeLabelsEnabled(this.conf)) {
+ // check application submission context and see if am resource request
+ // or application itself contains any node label expression.
+ ResourceRequest amReqFromAppContext =
+ appContext.getAMContainerResourceRequest();
+ String labelExp = (amReqFromAppContext != null) ?
+ amReqFromAppContext.getNodeLabelExpression() : null;
+ if (labelExp == null) {
+ labelExp = appContext.getNodeLabelExpression();
+ }
+ if (labelExp != null &&
+ !labelExp.equals(RMNodeLabelsManager.NO_LABEL)) {
+ String message = "Failed to recover application " + appId
+ + ". NodeLabel is not enabled in cluster, but AM resource request "
+ + "contains a label expression.";
+ LOG.warn(message);
+ application.handle(
+ new RMAppEvent(appId, RMAppEventType.APP_REJECTED, message));
+ return;
+ }
+ }
+
application.handle(new RMAppRecoverEvent(appId, rmState));
}
@@ -338,8 +368,28 @@ public class RMAppManager implements EventHandler<RMAppManagerEvent>,
}
ApplicationId applicationId = submissionContext.getApplicationId();
- ResourceRequest amReq =
- validateAndCreateResourceRequest(submissionContext, isRecovery);
+ ResourceRequest amReq = null;
+ try {
+ amReq = validateAndCreateResourceRequest(submissionContext, isRecovery);
+ } catch (InvalidLabelResourceRequestException e) {
+ // This can happen if the application was submitted and run with
+ // node labels enabled but is being recovered with node labels
+ // disabled, so its resource requests may still carry a node label
+ // expression. In that case, create the RMAppImpl with a null amReq
+ // and reject the application later with a clear error message, so
+ // that the RM still tracks the application after recovery and the
+ // user can see what happened and react accordingly.
+ if (isRecovery &&
+ !YarnConfiguration.areNodeLabelsEnabled(this.conf)) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("AMResourceRequest is not created for " + applicationId
+ + ". NodeLabel is not enabled in cluster, but AM resource "
+ + "request contains a label expression.");
+ }
+ } else {
+ throw e;
+ }
+ }
// Verify and get the update application priority and set back to
// submissionContext
http://git-wip-us.apache.org/repos/asf/hadoop/blob/959db386/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java
index ba12ca4..76a3488 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java
@@ -43,6 +43,8 @@ import java.util.Set;
import java.util.concurrent.ConcurrentMap;
import org.apache.commons.io.FileUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.io.DataOutputBuffer;
@@ -104,6 +106,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptS
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.TestUtils;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
import org.apache.hadoop.yarn.util.ConverterUtils;
@@ -121,6 +124,7 @@ import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Sets;
public class TestRMRestart extends ParameterizedSchedulerTestBase {
+ private static final Log LOG = LogFactory.getLog(TestRMRestart.class);
private final static File TEMP_DIR = new File(System.getProperty(
"test.build.data", "/tmp"), "decommision");
private File hostFile = new File(TEMP_DIR + File.separator + "hostFile.txt");
@@ -2337,4 +2341,76 @@ public class TestRMRestart extends ParameterizedSchedulerTestBase {
rm2.stop();
}
+ @Test(timeout = 60000)
+ public void testRMRestartAfterNodeLabelDisabled() throws Exception {
+ // Verifies that RM restart succeeds after node labels are turned off.
+ // CapacityScheduler only: other schedulers lack full node label support.
+ if (!getSchedulerType().equals(SchedulerType.CAPACITY)) {
+ return;
+ }
+ MemoryRMStateStore memStore = new MemoryRMStateStore();
+ memStore.init(conf);
+
+ conf.setBoolean(YarnConfiguration.NODE_LABELS_ENABLED, true);
+
+ MockRM rm1 = new MockRM(
+ TestUtils.getConfigurationWithDefaultQueueLabels(conf), memStore) {
+ @Override
+ protected RMNodeLabelsManager createNodeLabelManager() {
+ RMNodeLabelsManager mgr = new RMNodeLabelsManager();
+ mgr.init(getConfig());
+ return mgr;
+ }
+ };
+ rm1.start();
+
+ // add node label "x" to the cluster and map node h1 to it
+ Set<String> clusterNodeLabels = new HashSet<String>();
+ clusterNodeLabels.add("x");
+ RMNodeLabelsManager nodeLabelManager =
+ rm1.getRMContext().getNodeLabelManager();
+ nodeLabelManager.
+ addToCluserNodeLabelsWithDefaultExclusivity(clusterNodeLabels);
+ nodeLabelManager.addLabelsToNode(
+ ImmutableMap.of(NodeId.newInstance("h1", 0), toSet("x")));
+ MockNM nm1 = rm1.registerNode("h1:1234", 8000); // label = x
+
+ // submit an application whose AM node label expression is "x"
+ RMApp app1 = rm1.submitApp(200, "someApp", "someUser", null, "a1", "x");
+ // check that the AM container was allocated with label expression "x"
+ MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+ ContainerId amContainerId1 =
+ ContainerId.newContainerId(am1.getApplicationAttemptId(), 1);
+ Assert.assertEquals("x", rm1.getRMContext().getScheduler().
+ getRMContainer(amContainerId1).getNodeLabelExpression());
+ finishApplicationMaster(app1, rm1, nm1, am1);
+
+ // restart the RM with node labels disabled
+ conf.setBoolean(YarnConfiguration.NODE_LABELS_ENABLED, false);
+ MockRM rm2 = new MockRM(
+ TestUtils.getConfigurationWithDefaultQueueLabels(conf), memStore) {
+ @Override
+ protected RMNodeLabelsManager createNodeLabelManager() {
+ RMNodeLabelsManager mgr = new RMNodeLabelsManager();
+ mgr.init(getConfig());
+ return mgr;
+ }
+ };
+
+ // rm2 should start successfully and load app1 back in FAILED state,
+ // because node labels are now disabled while the AM resource request
+ // still contains a node label expression.
+ try {
+ rm2.start();
+ Assert.assertTrue("RM start successfully", true);
+ Assert.assertEquals(1, rm2.getRMContext().getRMApps().size());
+ rm2.waitForState(app1.getApplicationId(), RMAppState.FAILED);
+ } catch (Exception e) {
+ LOG.debug("Exception on start", e);
+ Assert.fail("RM should start without any issue");
+ } finally {
+ rm1.stop();
+ rm2.stop();
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org