You are viewing a plain text version of this content. The canonical link for it is here.
Posted to mapreduce-commits@hadoop.apache.org by ma...@apache.org on 2011/05/18 03:04:31 UTC

svn commit: r1104647 - in /hadoop/mapreduce/branches/MR-279: ./ yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/ yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org...

Author: mahadev
Date: Wed May 18 01:04:31 2011
New Revision: 1104647

URL: http://svn.apache.org/viewvc?rev=1104647&view=rev
Log:
Fix job hang if the AM launch fails. (mahadev)

Modified:
    hadoop/mapreduce/branches/MR-279/CHANGES.txt
    hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/AMLauncher.java
    hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/ApplicationMasterInfo.java
    hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/SchedulerNegotiator.java
    hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/events/ApplicationMasterEvents.java

Modified: hadoop/mapreduce/branches/MR-279/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/branches/MR-279/CHANGES.txt?rev=1104647&r1=1104646&r2=1104647&view=diff
==============================================================================
--- hadoop/mapreduce/branches/MR-279/CHANGES.txt (original)
+++ hadoop/mapreduce/branches/MR-279/CHANGES.txt Wed May 18 01:04:31 2011
@@ -3,6 +3,8 @@ Hadoop MapReduce Change Log
 Trunk (unreleased changes)
 
   MAPREDUCE-279
+
+    Fix job hang if the AM launch fails. (mahadev) 
    
     MAPREDUCE-2504. race in JobHistoryEventHandler stop (siddharth seth via mahadev)
 

Modified: hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/AMLauncher.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/AMLauncher.java?rev=1104647&r1=1104646&r2=1104647&view=diff
==============================================================================
--- hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/AMLauncher.java (original)
+++ hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/AMLauncher.java Wed May 18 01:04:31 2011
@@ -256,14 +256,15 @@ public class AMLauncher implements Runna
   public void run() {
     switch (event) {
     case LAUNCH:
+      ApplicationEventType eventType = ApplicationEventType.LAUNCHED;
       try {
         LOG.info("Launching master" + master.getMaster());
         launch();
-        } catch(IOException ie) {
+      } catch(Exception ie) {
         LOG.info("Error launching ", ie);
-        handler.handle(new ASMEvent<ApplicationEventType>(ApplicationEventType.FAILED, master));
+        eventType = ApplicationEventType.LAUNCH_FAILED;
       }
-      handler.handle(new ASMEvent<ApplicationEventType>(ApplicationEventType.LAUNCHED,  master));
+      handler.handle(new ASMEvent<ApplicationEventType>(eventType,  master));
       break;
     case CLEANUP:
       try {

Modified: hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/ApplicationMasterInfo.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/ApplicationMasterInfo.java?rev=1104647&r1=1104646&r2=1104647&view=diff
==============================================================================
--- hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/ApplicationMasterInfo.java (original)
+++ hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/ApplicationMasterInfo.java Wed May 18 01:04:31 2011
@@ -80,6 +80,7 @@ public class ApplicationMasterInfo imple
   private final AllocateTransition allocateTransition = new AllocateTransition();
   private final LaunchTransition launchTransition =  new LaunchTransition();
   private final LaunchedTransition launchedTransition = new LaunchedTransition();
+  private final FailedLaunchTransition failedLaunchTransition = new FailedLaunchTransition();
   
   private final StateMachine<ApplicationState, ApplicationEventType, 
   ASMEvent<ApplicationEventType>> stateMachine;
@@ -133,6 +134,12 @@ public class ApplicationMasterInfo imple
   .addTransition(ApplicationState.LAUNCHING, ApplicationState.LAUNCHED,
   ApplicationEventType.LAUNCHED, launchedTransition)
   
+  .addTransition(ApplicationState.LAUNCHING, ApplicationState.PENDING,
+  ApplicationEventType.LAUNCH_FAILED, failedLaunchTransition)
+  
+  .addTransition(ApplicationState.PENDING, ApplicationState.ALLOCATING, 
+  ApplicationEventType.RELEASED, new ScheduleTransition())
+  
   /** we cant say if the application was launched or not on a recovery, so for now 
    * we assume it was launched and wait for its restart.
    */
@@ -326,6 +333,16 @@ public class ApplicationMasterInfo imple
     }
   }
   
+  private static class FailedLaunchTransition implements /* wired to LAUNCH_FAILED in state LAUNCHING; the state machine then moves to PENDING */
+  SingleArcTransition<ApplicationMasterInfo, ASMEvent<ApplicationEventType>> {
+    @Override
+    public void transition(ApplicationMasterInfo masterInfo,
+    ASMEvent<ApplicationEventType> event) { // the incoming event is not inspected
+      masterInfo.handler.handle(new ASMEvent<SNEventType>( // dispatch RELEASE; SchedulerNegotiator's RELEASE case releases the AM's master container
+      SNEventType.RELEASE, masterInfo));
+    }
+  }
+  
   private static class LaunchTransition implements
   SingleArcTransition<ApplicationMasterInfo, ASMEvent<ApplicationEventType>> {
     @Override
@@ -389,6 +406,19 @@ public class ApplicationMasterInfo imple
   }
 
 
+  /* Transition wired to RELEASED in state PENDING (moving to ALLOCATING): schedules the AM again after its container launch failed */
+  private static class ScheduleTransition implements 
+  SingleArcTransition<ApplicationMasterInfo, ASMEvent<ApplicationEventType>> {
+    @Override
+    public void transition(ApplicationMasterInfo masterInfo,
+    ASMEvent<ApplicationEventType> event) {
+      masterInfo.masterContainer = null; // forget the previously assigned master container
+      /* dispatch SCHEDULE; SchedulerNegotiator handles it by calling addPending for this application */
+      masterInfo.handler.handle(new ASMEvent<SNEventType>(SNEventType.SCHEDULE,
+      masterInfo));
+    }
+  }
+  
   /* Transition to start the process of allocating for the AM container */
   private static class AllocateTransition implements 
   SingleArcTransition<ApplicationMasterInfo, ASMEvent<ApplicationEventType>> {

Modified: hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/SchedulerNegotiator.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/SchedulerNegotiator.java?rev=1104647&r1=1104646&r2=1104647&view=diff
==============================================================================
--- hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/SchedulerNegotiator.java (original)
+++ hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/SchedulerNegotiator.java Wed May 18 01:04:31 2011
@@ -202,6 +202,17 @@ class SchedulerNegotiator extends Abstra
     case SCHEDULE:
       addPending(appContext);
       break;
+    case RELEASE:
+      try {
+      scheduler.allocate(appContext.getApplicationID(), 
+          EMPTY_ASK, Collections.singletonList(appContext.getMasterContainer()));
+      } catch(IOException ie) {
+        //TODO remove IOException from the scheduler.
+        LOG.error("Error while releasing container for AM " + appContext.getApplicationID());
+      }
+      handler.handle(new ASMEvent<ApplicationEventType>(ApplicationEventType.RELEASED, 
+          appContext));
+      break;
     case CLEANUP:
       try {
         finishApplication(appContext);

Modified: hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/events/ApplicationMasterEvents.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/events/ApplicationMasterEvents.java?rev=1104647&r1=1104646&r2=1104647&view=diff
==============================================================================
--- hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/events/ApplicationMasterEvents.java (original)
+++ hadoop/mapreduce/branches/MR-279/yarn/yarn-server/yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/events/ApplicationMasterEvents.java Wed May 18 01:04:31 2011
@@ -30,6 +30,7 @@ import org.apache.hadoop.classification.
 public class ApplicationMasterEvents {
   public enum SNEventType {
     SCHEDULE,
+    RELEASE, // sent by ApplicationMasterInfo after an AM launch failure so the AM's master container is released
     CLEANUP
   };
 
@@ -48,10 +49,12 @@ public class ApplicationMasterEvents {
   public enum ApplicationEventType {
     ALLOCATE,
     REGISTERED,
+    RELEASED,
     RECOVER,
     REMOVE,
     STATUSUPDATE,
     LAUNCH,
+    LAUNCH_FAILED,
     LAUNCHED,
     FAILED,
     FAILED_MAX_RETRIES,