Posted to common-commits@hadoop.apache.org by su...@apache.org on 2017/02/08 10:34:03 UTC

hadoop git commit: YARN-6031. Application recovery has failed when node label feature is turned off during RM recovery. Contributed by Ying Zhang.

Repository: hadoop
Updated Branches:
  refs/heads/branch-2.8 2bbcaa8ad -> 959db3866


YARN-6031. Application recovery has failed when node label feature is turned off during RM recovery. Contributed by Ying Zhang.


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/959db386
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/959db386
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/959db386

Branch: refs/heads/branch-2.8
Commit: 959db386626aea63f9fa0cedcb2c06887283b71f
Parents: 2bbcaa8
Author: Sunil G <su...@apache.org>
Authored: Wed Feb 8 16:03:45 2017 +0530
Committer: Sunil G <su...@apache.org>
Committed: Wed Feb 8 16:03:45 2017 +0530

----------------------------------------------------------------------
 .../server/resourcemanager/RMAppManager.java    | 54 +++++++++++++-
 .../server/resourcemanager/TestRMRestart.java   | 76 ++++++++++++++++++++
 2 files changed, 128 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/959db386/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java
index b2f06ab..ff0bebb 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java
@@ -37,6 +37,7 @@ import org.apache.hadoop.yarn.api.records.ResourceRequest;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.event.EventHandler;
 import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException;
+import org.apache.hadoop.yarn.exceptions.InvalidLabelResourceRequestException;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.ipc.RPCUtil;
 import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
@@ -55,6 +56,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
 import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
 import org.apache.hadoop.yarn.server.utils.BuilderUtils;
 
@@ -322,6 +324,34 @@ public class RMAppManager implements EventHandler<RMAppManagerEvent>,
         createAndPopulateNewRMApp(appContext, appState.getSubmitTime(),
             appState.getUser(), true);
 
+    // If null amReq has been returned, check if it is the case that
+    // application has specified node label expression while node label
+    // has been disabled. Reject the recovery of this application if it
+    // is true and give clear message so that user can react properly.
+    if (!appContext.getUnmanagedAM() &&
+        application.getAMResourceRequest() == null &&
+        !YarnConfiguration.areNodeLabelsEnabled(this.conf)) {
+      // check application submission context and see if am resource request
+      // or application itself contains any node label expression.
+      ResourceRequest amReqFromAppContext =
+          appContext.getAMContainerResourceRequest();
+      String labelExp = (amReqFromAppContext != null) ?
+          amReqFromAppContext.getNodeLabelExpression() : null;
+      if (labelExp == null) {
+        labelExp = appContext.getNodeLabelExpression();
+      }
+      if (labelExp != null &&
+          !labelExp.equals(RMNodeLabelsManager.NO_LABEL)) {
+        String message = "Failed to recover application " + appId
+            + ". NodeLabel is not enabled in cluster, but AM resource request "
+            + "contains a label expression.";
+        LOG.warn(message);
+        application.handle(
+            new RMAppEvent(appId, RMAppEventType.APP_REJECTED, message));
+        return;
+      }
+    }
+
     application.handle(new RMAppRecoverEvent(appId, rmState));
   }
 
@@ -338,8 +368,28 @@ public class RMAppManager implements EventHandler<RMAppManagerEvent>,
     }
     
     ApplicationId applicationId = submissionContext.getApplicationId();
-    ResourceRequest amReq =
-        validateAndCreateResourceRequest(submissionContext, isRecovery);
+    ResourceRequest amReq = null;
+    try {
+      amReq = validateAndCreateResourceRequest(submissionContext, isRecovery);
+    } catch (InvalidLabelResourceRequestException e) {
+      // This can happen if the application had been submitted and run
+      // with Node Label enabled but recover with Node Label disabled.
+      // Thus there might be node label expression in the application's
+      // resource requests. If this is the case, create RmAppImpl with
+      // null amReq and reject the application later with clear error
+      // message. So that the application can still be tracked by RM
+      // after recovery and user can see what's going on and react accordingly.
+      if (isRecovery &&
+          !YarnConfiguration.areNodeLabelsEnabled(this.conf)) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("AMResourceRequest is not created for " + applicationId
+              + ". NodeLabel is not enabled in cluster, but AM resource "
+              + "request contains a label expression.");
+        }
+      } else {
+        throw e;
+      }
+    }
 
     // Verify and get the update application priority and set back to
     // submissionContext
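
For context (not part of the patch): the new guard keys on the cluster-wide node label
switch read through YarnConfiguration. A minimal sketch of that check, using only the
constant and helper already referenced in the hunks above:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.yarn.conf.YarnConfiguration;

    public class NodeLabelsFlagSketch {
      public static void main(String[] args) {
        // Simulate an RM restarted with the node label feature turned off.
        Configuration conf = new YarnConfiguration();
        conf.setBoolean(YarnConfiguration.NODE_LABELS_ENABLED, false);

        // Same helper the recovery guard above consults; when it returns false
        // and a recovered app still carries a label expression, the app is
        // rejected with a clear diagnostic instead of failing RM recovery.
        System.out.println("node labels enabled: "
            + YarnConfiguration.areNodeLabelsEnabled(conf));
      }
    }

With the flag off, validateAndCreateResourceRequest throws
InvalidLabelResourceRequestException for label-carrying requests; the second hunk
above catches that exception only on the recovery path and leaves amReq null, so the
application can still be created and then rejected later with a clear message.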

http://git-wip-us.apache.org/repos/asf/hadoop/blob/959db386/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java
index ba12ca4..76a3488 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java
@@ -43,6 +43,8 @@ import java.util.Set;
 import java.util.concurrent.ConcurrentMap;
 
 import org.apache.commons.io.FileUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
 import org.apache.hadoop.io.DataOutputBuffer;
@@ -104,6 +106,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptS
 import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.TestUtils;
 import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
 import org.apache.hadoop.yarn.server.utils.BuilderUtils;
 import org.apache.hadoop.yarn.util.ConverterUtils;
@@ -121,6 +124,7 @@ import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Sets;
 
 public class TestRMRestart extends ParameterizedSchedulerTestBase {
+  private static final Log LOG = LogFactory.getLog(TestRMRestart.class);
   private final static File TEMP_DIR = new File(System.getProperty(
     "test.build.data", "/tmp"), "decommision");
   private File hostFile = new File(TEMP_DIR + File.separator + "hostFile.txt");
@@ -2337,4 +2341,76 @@ public class TestRMRestart extends ParameterizedSchedulerTestBase {
     rm2.stop();
   }
 
+  @Test(timeout = 60000)
+  public void testRMRestartAfterNodeLabelDisabled() throws Exception {
+    // Skip this test case if it is not CapacityScheduler since NodeLabel is
+    // not fully supported yet for FairScheduler and others.
+    if (!getSchedulerType().equals(SchedulerType.CAPACITY)) {
+      return;
+    }
+    MemoryRMStateStore memStore = new MemoryRMStateStore();
+    memStore.init(conf);
+
+    conf.setBoolean(YarnConfiguration.NODE_LABELS_ENABLED, true);
+
+    MockRM rm1 = new MockRM(
+        TestUtils.getConfigurationWithDefaultQueueLabels(conf), memStore) {
+      @Override
+      protected RMNodeLabelsManager createNodeLabelManager() {
+        RMNodeLabelsManager mgr = new RMNodeLabelsManager();
+        mgr.init(getConfig());
+        return mgr;
+      }
+    };
+    rm1.start();
+
+    // add node label "x" and set node to label mapping
+    Set<String> clusterNodeLabels = new HashSet<String>();
+    clusterNodeLabels.add("x");
+    RMNodeLabelsManager nodeLabelManager =
+        rm1.getRMContext().getNodeLabelManager();
+    nodeLabelManager.
+        addToCluserNodeLabelsWithDefaultExclusivity(clusterNodeLabels);
+    nodeLabelManager.addLabelsToNode(
+        ImmutableMap.of(NodeId.newInstance("h1", 0), toSet("x")));
+    MockNM nm1 = rm1.registerNode("h1:1234", 8000); // label = x
+
+    // submit an application with specifying am node label expression as "x"
+    RMApp app1 = rm1.submitApp(200, "someApp", "someUser", null, "a1", "x");
+    // check am container allocated with correct node label expression
+    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+    ContainerId  amContainerId1 =
+        ContainerId.newContainerId(am1.getApplicationAttemptId(), 1);
+    Assert.assertEquals("x", rm1.getRMContext().getScheduler().
+        getRMContainer(amContainerId1).getNodeLabelExpression());
+    finishApplicationMaster(app1, rm1, nm1, am1);
+
+    // restart rm with node label disabled
+    conf.setBoolean(YarnConfiguration.NODE_LABELS_ENABLED, false);
+    MockRM rm2 = new MockRM(
+        TestUtils.getConfigurationWithDefaultQueueLabels(conf), memStore) {
+      @Override
+      protected RMNodeLabelsManager createNodeLabelManager() {
+        RMNodeLabelsManager mgr = new RMNodeLabelsManager();
+        mgr.init(getConfig());
+        return mgr;
+      }
+    };
+
+    // rm should successfully start with app1 loaded back in FAILED state
+    // due to node label not enabled but am resource request contains
+    // node label expression.
+    try {
+      rm2.start();
+      Assert.assertTrue("RM start successfully", true);
+      Assert.assertEquals(1, rm2.getRMContext().getRMApps().size());
+      rm2.waitForState(app1.getApplicationId(), RMAppState.FAILED);
+    } catch (Exception e) {
+      LOG.debug("Exception on start", e);
+      Assert.fail("RM should start without any issue");
+    } finally {
+      rm1.stop();
+      rm2.stop();
+    }
+  }
 }
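
As a possible follow-up check (hypothetical, not part of this patch): once rm2 is up,
the rejection message set in RMAppManager should surface in the recovered
application's diagnostics. A sketch against the test above; RMApp#getDiagnostics and
the exact assertion are assumptions, only the message text comes from the
RMAppManager hunk:

    // Hypothetical extra assertion for testRMRestartAfterNodeLabelDisabled.
    rm2.waitForState(app1.getApplicationId(), RMAppState.FAILED);
    RMApp recovered =
        rm2.getRMContext().getRMApps().get(app1.getApplicationId());
    // The APP_REJECTED diagnostic should mention the disabled node label feature.
    Assert.assertTrue(recovered.getDiagnostics().toString()
        .contains("NodeLabel is not enabled in cluster"));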

