You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by ka...@apache.org on 2016/01/25 18:26:02 UTC

[27/50] [abbrv] hadoop git commit: YARN-4584. RM startup failure when AM attempts greater than max-attempts. (Bibin A Chundatt via rohithsharmaks)

YARN-4584. RM startup failure when AM attempts greater than max-attempts. (Bibin A Chundatt via rohithsharmaks)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/e3066810
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/e3066810
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/e3066810

Branch: refs/heads/YARN-1011
Commit: e30668106dc246f68db36fbd1f2db6ec08cd96f2
Parents: b2ffcc2
Author: Rohith Sharma K S <ro...@apache.org>
Authored: Fri Jan 22 10:14:46 2016 +0530
Committer: Rohith Sharma K S <ro...@apache.org>
Committed: Fri Jan 22 10:14:46 2016 +0530

----------------------------------------------------------------------
 hadoop-yarn-project/CHANGES.txt                 |  3 +
 .../server/resourcemanager/rmapp/RMAppImpl.java | 23 +++++---
 .../server/resourcemanager/TestRMRestart.java   | 58 ++++++++++++++++++++
 3 files changed, 77 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/e3066810/hadoop-yarn-project/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt
index f789bcb..a7a63b1 100644
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@@ -137,6 +137,9 @@ Release 2.9.0 - UNRELEASED
     YARN-4611. Fix scheduler load simulator to support multi-layer network
     location. (Ming Ma via xgong)
 
+    YARN-4584. RM startup failure when AM attempts greater than max-attempts.
+    (Bibin A Chundatt via rohithsharmaks)
+
 Release 2.8.0 - UNRELEASED
 
   INCOMPATIBLE CHANGES

http://git-wip-us.apache.org/repos/asf/hadoop/blob/e3066810/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
index 6ecc7d3..1a390df 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
@@ -841,7 +841,7 @@ public class RMAppImpl implements RMApp, Recoverable {
     this.startTime = appState.getStartTime();
     this.callerContext = appState.getCallerContext();
     // If interval > 0, some attempts might have been deleted.
-    if (submissionContext.getAttemptFailuresValidityInterval() > 0) {
+    if (this.attemptFailuresValidityInterval > 0) {
       this.firstAttemptIdInStateStore = appState.getFirstAttemptId();
       this.nextAttemptId = firstAttemptIdInStateStore;
     }
@@ -1341,7 +1341,9 @@ public class RMAppImpl implements RMApp, Recoverable {
           + "is " + numberOfFailure + ". The max attempts is "
           + app.maxAppAttempts);
 
-      removeExcessAttempts(app);
+      if (app.attemptFailuresValidityInterval > 0) {
+        removeExcessAttempts(app);
+      }
 
       if (!app.submissionContext.getUnmanagedAM()
           && numberOfFailure < app.maxAppAttempts) {
@@ -1381,15 +1383,22 @@ public class RMAppImpl implements RMApp, Recoverable {
     }
 
     private void removeExcessAttempts(RMAppImpl app) {
-      while (app.nextAttemptId - app.firstAttemptIdInStateStore
-          > app.maxAppAttempts) {
+      while (app.nextAttemptId
+          - app.firstAttemptIdInStateStore > app.maxAppAttempts) {
         // attempts' first element is oldest attempt because it is a
         // LinkedHashMap
         ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(
             app.getApplicationId(), app.firstAttemptIdInStateStore);
-        app.firstAttemptIdInStateStore++;
-        LOG.info("Remove attempt from state store : " + attemptId);
-        app.rmContext.getStateStore().removeApplicationAttempt(attemptId);
+        RMAppAttempt rmAppAttempt = app.getRMAppAttempt(attemptId);
+        long endTime = app.systemClock.getTime();
+        if (rmAppAttempt.getFinishTime() < (endTime
+            - app.attemptFailuresValidityInterval)) {
+          app.firstAttemptIdInStateStore++;
+          LOG.info("Remove attempt from state store : " + attemptId);
+          app.rmContext.getStateStore().removeApplicationAttempt(attemptId);
+        } else {
+          break;
+        }
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/e3066810/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java
index bad68f4..625c3b8 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java
@@ -42,6 +42,8 @@ import java.util.Map;
 import java.util.Set;
 
 import org.apache.commons.io.FileUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
 import org.apache.hadoop.io.DataOutputBuffer;
@@ -104,6 +106,8 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
 import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
 import org.apache.hadoop.yarn.server.utils.BuilderUtils;
 import org.apache.hadoop.yarn.util.ConverterUtils;
@@ -121,6 +125,7 @@ import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Sets;
 
 public class TestRMRestart extends ParameterizedSchedulerTestBase {
+  private static final Log LOG = LogFactory.getLog(TestRMRestart.class);
   private final static File TEMP_DIR = new File(System.getProperty(
     "test.build.data", "/tmp"), "decommision");
   private File hostFile = new File(TEMP_DIR + File.separator + "hostFile.txt");
@@ -2321,4 +2326,57 @@ public class TestRMRestart extends ParameterizedSchedulerTestBase {
     rm2.stop();
   }
 
+  @Test(timeout = 120000)
+  public void testRMRestartAfterPreemption() throws Exception {
+    Configuration conf = new Configuration();
+    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 2);
+    if (!getSchedulerType().equals(SchedulerType.CAPACITY)) {
+      return;
+    }
+    MemoryRMStateStore memStore = new MemoryRMStateStore();
+    memStore.init(conf);
+    // start RM
+    MockRM rm1 = new MockRM(conf, memStore);
+    rm1.start();
+    MockNM nm1 =
+        new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
+    nm1.registerNode();
+    int CONTAINER_MEMORY = 1024;
+    // create app and launch the AM
+    RMApp app0 = rm1.submitApp(CONTAINER_MEMORY);
+    MockAM am0 = MockRM.launchAM(app0, rm1, nm1);
+    nm1.nodeHeartbeat(am0.getApplicationAttemptId(), 1,
+        ContainerState.COMPLETE);
+    am0.waitForState(RMAppAttemptState.FAILED);
+    for (int i = 0; i < 4; i++) {
+      am0 = MockRM.launchAM(app0, rm1, nm1);
+      am0.registerAppAttempt();
+      CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();
+      // get scheduler app
+      FiCaSchedulerApp schedulerAppAttempt = cs.getSchedulerApplications()
+          .get(app0.getApplicationId()).getCurrentAppAttempt();
+      // kill app0-attempt
+      cs.killPreemptedContainer(schedulerAppAttempt.getRMContainer(
+          app0.getCurrentAppAttempt().getMasterContainer().getId()));
+    }
+    am0 = MockRM.launchAM(app0, rm1, nm1);
+    am0.registerAppAttempt();
+    rm1.killApp(app0.getApplicationId());
+    rm1.waitForState(app0.getCurrentAppAttempt().getAppAttemptId(),
+        RMAppAttemptState.KILLED);
+
+    MockRM rm2 = null;
+    // start RM2
+    try {
+      rm2 = new MockRM(conf, memStore);
+      rm2.start();
+      Assert.assertTrue("RM start successfully", true);
+    } catch (Exception e) {
+      LOG.debug("Exception on start", e);
+      Assert.fail("RM should start with out any issue");
+    } finally {
+      rm1.stop();
+    }
+  }
+
 }