You are viewing a plain text version of this content. The canonical link for it is here.
Posted to yarn-commits@hadoop.apache.org by vi...@apache.org on 2014/06/24 17:15:12 UTC
svn commit: r1605106 - in /hadoop/common/trunk/hadoop-yarn-project: ./
hadoop-yarn/hadoop-yarn-api/src/main/proto/server/
hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/reco...
Author: vinodkv
Date: Tue Jun 24 15:15:12 2014
New Revision: 1605106
URL: http://svn.apache.org/r1605106
Log:
YARN-2074. Changed ResourceManager to not count AM preemptions towards app failures. Contributed by Jian He.
Modified:
hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/server/yarn_server_resourcemanager_service_protos.proto
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileSystemRMStateStore.java
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/MemoryRMStateStore.java
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/ApplicationAttemptStateData.java
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/impl/pb/ApplicationAttemptStateDataPBImpl.java
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttempt.java
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/TestAMRestart.java
hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreTestBase.java
Modified: hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt?rev=1605106&r1=1605105&r2=1605106&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt (original)
+++ hadoop/common/trunk/hadoop-yarn-project/CHANGES.txt Tue Jun 24 15:15:12 2014
@@ -175,6 +175,9 @@ Release 2.5.0 - UNRELEASED
YARN-2195. Clean a piece of code in ResourceRequest. (Wei Yan via devaraj)
+ YARN-2074. Changed ResourceManager to not count AM preemptions towards app
+ failures. (Jian He via vinodkv)
+
OPTIMIZATIONS
BUG FIXES
Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/server/yarn_server_resourcemanager_service_protos.proto
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/server/yarn_server_resourcemanager_service_protos.proto?rev=1605106&r1=1605105&r2=1605106&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/server/yarn_server_resourcemanager_service_protos.proto (original)
+++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/server/yarn_server_resourcemanager_service_protos.proto Tue Jun 24 15:15:12 2014
@@ -127,6 +127,7 @@ message ApplicationAttemptStateDataProto
optional string diagnostics = 6 [default = "N/A"];
optional int64 start_time = 7;
optional FinalApplicationStatusProto final_application_status = 8;
+ optional int32 am_container_exit_status = 9 [default = -1000];
}
message RMStateVersionProto {
Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileSystemRMStateStore.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileSystemRMStateStore.java?rev=1605106&r1=1605105&r2=1605106&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileSystemRMStateStore.java (original)
+++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileSystemRMStateStore.java Tue Jun 24 15:15:12 2014
@@ -216,7 +216,8 @@ public class FileSystemRMStateStore exte
attemptStateData.getState(),
attemptStateData.getFinalTrackingUrl(),
attemptStateData.getDiagnostics(),
- attemptStateData.getFinalApplicationStatus());
+ attemptStateData.getFinalApplicationStatus(),
+ attemptStateData.getAMContainerExitStatus());
// assert child node name is same as application attempt id
assert attemptId.equals(attemptState.getAttemptId());
Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/MemoryRMStateStore.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/MemoryRMStateStore.java?rev=1605106&r1=1605105&r2=1605106&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/MemoryRMStateStore.java (original)
+++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/MemoryRMStateStore.java Tue Jun 24 15:15:12 2014
@@ -152,7 +152,8 @@ public class MemoryRMStateStore extends
attemptStateData.getStartTime(), attemptStateData.getState(),
attemptStateData.getFinalTrackingUrl(),
attemptStateData.getDiagnostics(),
- attemptStateData.getFinalApplicationStatus());
+ attemptStateData.getFinalApplicationStatus(),
+ attemptStateData.getAMContainerExitStatus());
ApplicationState appState =
state.getApplicationState().get(
Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java?rev=1605106&r1=1605105&r2=1605106&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java (original)
+++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java Tue Jun 24 15:15:12 2014
@@ -39,6 +39,7 @@ import org.apache.hadoop.yarn.api.record
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.api.records.Container;
+import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl;
import org.apache.hadoop.yarn.event.AsyncDispatcher;
@@ -258,19 +259,21 @@ public abstract class RMStateStore exten
RMAppAttemptState state;
String finalTrackingUrl = "N/A";
String diagnostics;
+ int exitStatus = ContainerExitStatus.INVALID;
FinalApplicationStatus amUnregisteredFinalStatus;
public ApplicationAttemptState(ApplicationAttemptId attemptId,
Container masterContainer, Credentials appAttemptCredentials,
long startTime) {
this(attemptId, masterContainer, appAttemptCredentials, startTime, null,
- null, "", null);
+ null, "", null, ContainerExitStatus.INVALID);
}
public ApplicationAttemptState(ApplicationAttemptId attemptId,
Container masterContainer, Credentials appAttemptCredentials,
long startTime, RMAppAttemptState state, String finalTrackingUrl,
- String diagnostics, FinalApplicationStatus amUnregisteredFinalStatus) {
+ String diagnostics, FinalApplicationStatus amUnregisteredFinalStatus,
+ int exitStatus) {
this.attemptId = attemptId;
this.masterContainer = masterContainer;
this.appAttemptCredentials = appAttemptCredentials;
@@ -279,6 +282,7 @@ public abstract class RMStateStore exten
this.finalTrackingUrl = finalTrackingUrl;
this.diagnostics = diagnostics == null ? "" : diagnostics;
this.amUnregisteredFinalStatus = amUnregisteredFinalStatus;
+ this.exitStatus = exitStatus;
}
public Container getMasterContainer() {
@@ -305,6 +309,9 @@ public abstract class RMStateStore exten
public FinalApplicationStatus getFinalApplicationStatus() {
return amUnregisteredFinalStatus;
}
+ public int getAMContainerExitStatus(){
+ return this.exitStatus;
+ }
}
/**
Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java?rev=1605106&r1=1605105&r2=1605106&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java (original)
+++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java Tue Jun 24 15:15:12 2014
@@ -538,12 +538,12 @@ public class ZKRMStateStore extends RMSt
ApplicationAttemptState attemptState =
new ApplicationAttemptState(attemptId,
- attemptStateData.getMasterContainer(), credentials,
- attemptStateData.getStartTime(),
- attemptStateData.getState(),
- attemptStateData.getFinalTrackingUrl(),
- attemptStateData.getDiagnostics(),
- attemptStateData.getFinalApplicationStatus());
+ attemptStateData.getMasterContainer(), credentials,
+ attemptStateData.getStartTime(), attemptStateData.getState(),
+ attemptStateData.getFinalTrackingUrl(),
+ attemptStateData.getDiagnostics(),
+ attemptStateData.getFinalApplicationStatus(),
+ attemptStateData.getAMContainerExitStatus());
appState.attempts.put(attemptState.getAttemptId(), attemptState);
}
Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/ApplicationAttemptStateData.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/ApplicationAttemptStateData.java?rev=1605106&r1=1605105&r2=1605106&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/ApplicationAttemptStateData.java (original)
+++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/ApplicationAttemptStateData.java Tue Jun 24 15:15:12 2014
@@ -43,7 +43,7 @@ public abstract class ApplicationAttempt
ApplicationAttemptId attemptId, Container container,
ByteBuffer attemptTokens, long startTime, RMAppAttemptState finalState,
String finalTrackingUrl, String diagnostics,
- FinalApplicationStatus amUnregisteredFinalStatus) {
+ FinalApplicationStatus amUnregisteredFinalStatus, int exitStatus) {
ApplicationAttemptStateData attemptStateData =
Records.newRecord(ApplicationAttemptStateData.class);
attemptStateData.setAttemptId(attemptId);
@@ -54,6 +54,7 @@ public abstract class ApplicationAttempt
attemptStateData.setDiagnostics(diagnostics);
attemptStateData.setStartTime(startTime);
attemptStateData.setFinalApplicationStatus(amUnregisteredFinalStatus);
+ attemptStateData.setAMContainerExitStatus(exitStatus);
return attemptStateData;
}
@@ -67,11 +68,11 @@ public abstract class ApplicationAttempt
appAttemptTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
}
return newInstance(attemptState.getAttemptId(),
- attemptState.getMasterContainer(), appAttemptTokens,
- attemptState.getStartTime(), attemptState.getState(),
- attemptState.getFinalTrackingUrl(),
- attemptState.getDiagnostics(),
- attemptState.getFinalApplicationStatus());
+ attemptState.getMasterContainer(), appAttemptTokens,
+ attemptState.getStartTime(), attemptState.getState(),
+ attemptState.getFinalTrackingUrl(), attemptState.getDiagnostics(),
+ attemptState.getFinalApplicationStatus(),
+ attemptState.getAMContainerExitStatus());
}
public abstract ApplicationAttemptStateDataProto getProto();
@@ -150,5 +151,10 @@ public abstract class ApplicationAttempt
*/
public abstract FinalApplicationStatus getFinalApplicationStatus();
- public abstract void setFinalApplicationStatus(FinalApplicationStatus finishState);
+ public abstract void setFinalApplicationStatus(
+ FinalApplicationStatus finishState);
+
+ public abstract int getAMContainerExitStatus();
+
+ public abstract void setAMContainerExitStatus(int exitStatus);
}
Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/impl/pb/ApplicationAttemptStateDataPBImpl.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/impl/pb/ApplicationAttemptStateDataPBImpl.java?rev=1605106&r1=1605105&r2=1605106&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/impl/pb/ApplicationAttemptStateDataPBImpl.java (original)
+++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/impl/pb/ApplicationAttemptStateDataPBImpl.java Tue Jun 24 15:15:12 2014
@@ -253,6 +253,19 @@ public class ApplicationAttemptStateData
}
@Override
+ public int getAMContainerExitStatus() {
+ ApplicationAttemptStateDataProtoOrBuilder p = viaProto ? proto : builder;
+ return p.getAmContainerExitStatus();
+ }
+
+ @Override
+ public void setAMContainerExitStatus(int exitStatus) {
+ maybeInitBuilder();
+ builder.setAmContainerExitStatus(exitStatus);
+ }
+
+
+ @Override
public boolean equals(Object other) {
if (other == null)
return false;
@@ -281,5 +294,4 @@ public class ApplicationAttemptStateData
private FinalApplicationStatus convertFromProtoFormat(FinalApplicationStatusProto s) {
return ProtoUtils.convertFromProtoFormat(s);
}
-
}
Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java?rev=1605106&r1=1605105&r2=1605106&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java (original)
+++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java Tue Jun 24 15:15:12 2014
@@ -78,7 +78,6 @@ import org.apache.hadoop.yarn.state.Mult
import org.apache.hadoop.yarn.state.SingleArcTransition;
import org.apache.hadoop.yarn.state.StateMachine;
import org.apache.hadoop.yarn.state.StateMachineFactory;
-import org.apache.hadoop.yarn.util.resource.Resources;
@SuppressWarnings({ "rawtypes", "unchecked" })
public class RMAppImpl implements RMApp, Recoverable {
@@ -686,7 +685,11 @@ public class RMAppImpl implements RMApp,
ApplicationAttemptId.newInstance(applicationId, attempts.size() + 1);
RMAppAttempt attempt =
new RMAppAttemptImpl(appAttemptId, rmContext, scheduler, masterService,
- submissionContext, conf, maxAppAttempts == attempts.size());
+ submissionContext, conf,
+ // The newly created attempt maybe last attempt if (number of
+ // previously NonPreempted attempts + 1) equal to the max-attempt
+ // limit.
+ maxAppAttempts == (getNumNonPreemptedAppAttempts() + 1));
attempts.put(appAttemptId, attempt);
currentAttempt = attempt;
}
@@ -794,7 +797,7 @@ public class RMAppImpl implements RMApp,
&& (app.currentAttempt.getState() == RMAppAttemptState.KILLED
|| app.currentAttempt.getState() == RMAppAttemptState.FINISHED
|| (app.currentAttempt.getState() == RMAppAttemptState.FAILED
- && app.attempts.size() == app.maxAppAttempts))) {
+ && app.getNumNonPreemptedAppAttempts() == app.maxAppAttempts))) {
return RMAppState.ACCEPTED;
}
@@ -885,7 +888,7 @@ public class RMAppImpl implements RMApp,
msg = "Unmanaged application " + this.getApplicationId()
+ " failed due to " + failedEvent.getDiagnostics()
+ ". Failing the application.";
- } else if (this.attempts.size() >= this.maxAppAttempts) {
+ } else if (getNumNonPreemptedAppAttempts() >= this.maxAppAttempts) {
msg = "Application " + this.getApplicationId() + " failed "
+ this.maxAppAttempts + " times due to "
+ failedEvent.getDiagnostics() + ". Failing the application.";
@@ -1102,6 +1105,17 @@ public class RMAppImpl implements RMApp,
};
}
+ private int getNumNonPreemptedAppAttempts() {
+ int completedAttempts = 0;
+ // Do not count AM preemption as attempt failure.
+ for (RMAppAttempt attempt : attempts.values()) {
+ if (!attempt.isPreempted()) {
+ completedAttempts++;
+ }
+ }
+ return completedAttempts;
+ }
+
private static final class AttemptFailedTransition implements
MultipleArcTransition<RMAppImpl, RMAppEvent, RMAppState> {
@@ -1113,8 +1127,9 @@ public class RMAppImpl implements RMApp,
@Override
public RMAppState transition(RMAppImpl app, RMAppEvent event) {
+
if (!app.submissionContext.getUnmanagedAM()
- && app.attempts.size() < app.maxAppAttempts) {
+ && app.getNumNonPreemptedAppAttempts() < app.maxAppAttempts) {
boolean transferStateFromPreviousAttempt = false;
RMAppFailedAttemptEvent failedEvent = (RMAppFailedAttemptEvent) event;
transferStateFromPreviousAttempt =
Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttempt.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttempt.java?rev=1605106&r1=1605105&r2=1605106&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttempt.java (original)
+++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttempt.java Tue Jun 24 15:15:12 2014
@@ -196,4 +196,9 @@ public interface RMAppAttempt extends Ev
*/
ApplicationAttemptReport createApplicationAttemptReport();
+ /**
+ * Return the flag which indicates whether the attempt is preempted by the
+ * scheduler.
+ */
+ boolean isPreempted();
}
Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java?rev=1605106&r1=1605105&r2=1605106&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java (original)
+++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java Tue Jun 24 15:15:12 2014
@@ -48,11 +48,11 @@ import org.apache.hadoop.yarn.api.record
import org.apache.hadoop.yarn.api.records.ApplicationResourceUsageReport;
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.api.records.Container;
+import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.Priority;
-import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.api.records.YarnApplicationAttemptState;
import org.apache.hadoop.yarn.event.EventHandler;
@@ -146,9 +146,14 @@ public class RMAppAttemptImpl implements
// if an RMAppAttemptUnregistrationEvent occurs
private FinalApplicationStatus finalStatus = null;
private final StringBuilder diagnostics = new StringBuilder();
+ private int amContainerExitStatus = ContainerExitStatus.INVALID;
private Configuration conf;
- private final boolean isLastAttempt;
+ // Since AM preemption is not counted towards AM failure count,
+ // even if this flag is true, a new attempt can still be re-created if this
+ // attempt is eventually preempted. So this flag indicates that this may be
+ // last attempt.
+ private final boolean maybeLastAttempt;
private static final ExpiredTransition EXPIRED_TRANSITION =
new ExpiredTransition();
@@ -389,7 +394,7 @@ public class RMAppAttemptImpl implements
RMContext rmContext, YarnScheduler scheduler,
ApplicationMasterService masterService,
ApplicationSubmissionContext submissionContext,
- Configuration conf, boolean isLastAttempt) {
+ Configuration conf, boolean maybeLastAttempt) {
this.conf = conf;
this.applicationAttemptId = appAttemptId;
this.rmContext = rmContext;
@@ -403,7 +408,7 @@ public class RMAppAttemptImpl implements
this.writeLock = lock.writeLock();
this.proxiedTrackingUrl = generateProxyUriWithScheme(null);
- this.isLastAttempt = isLastAttempt;
+ this.maybeLastAttempt = maybeLastAttempt;
this.stateMachine = stateMachineFactory.make(this);
}
@@ -565,6 +570,15 @@ public class RMAppAttemptImpl implements
}
}
+ public int getAMContainerExitStatus() {
+ this.readLock.lock();
+ try {
+ return this.amContainerExitStatus;
+ } finally {
+ this.readLock.unlock();
+ }
+ }
+
@Override
public float getProgress() {
this.readLock.lock();
@@ -671,6 +685,7 @@ public class RMAppAttemptImpl implements
+ attemptState.getState());
diagnostics.append("Attempt recovered after RM restart");
diagnostics.append(attemptState.getDiagnostics());
+ this.amContainerExitStatus = attemptState.getAMContainerExitStatus();
setMasterContainer(attemptState.getMasterContainer());
recoverAppAttemptCredentials(attemptState.getAppAttemptCredentials());
this.recoveredFinalState = attemptState.getState();
@@ -931,7 +946,7 @@ public class RMAppAttemptImpl implements
String diags = null;
String finalTrackingUrl = null;
FinalApplicationStatus finalStatus = null;
-
+ int exitStatus = ContainerExitStatus.INVALID;
switch (event.getType()) {
case LAUNCH_FAILED:
RMAppAttemptLaunchFailedEvent launchFaileEvent =
@@ -952,6 +967,7 @@ public class RMAppAttemptImpl implements
RMAppAttemptContainerFinishedEvent finishEvent =
(RMAppAttemptContainerFinishedEvent) event;
diags = getAMContainerCrashedDiagnostics(finishEvent);
+ exitStatus = finishEvent.getContainerStatus().getExitStatus();
break;
case KILL:
break;
@@ -966,9 +982,10 @@ public class RMAppAttemptImpl implements
ApplicationAttemptState attemptState =
new ApplicationAttemptState(applicationAttemptId, getMasterContainer(),
rmStore.getCredentialsFromAppAttempt(this), startTime,
- stateToBeStored, finalTrackingUrl, diags, finalStatus);
+ stateToBeStored, finalTrackingUrl, diags, finalStatus, exitStatus);
LOG.info("Updating application attempt " + applicationAttemptId
- + " with final state: " + targetedFinalState);
+ + " with final state: " + targetedFinalState + ", and exit status: "
+ + exitStatus);
rmStore.updateApplicationAttemptState(attemptState);
}
@@ -1061,11 +1078,19 @@ public class RMAppAttemptImpl implements
// don't leave the tracking URL pointing to a non-existent AM
appAttempt.setTrackingUrlToRMAppPage();
appAttempt.invalidateAMHostAndPort();
+
if (appAttempt.submissionContext
.getKeepContainersAcrossApplicationAttempts()
- && !appAttempt.isLastAttempt
&& !appAttempt.submissionContext.getUnmanagedAM()) {
- keepContainersAcrossAppAttempts = true;
+ // See if we should retain containers for non-unmanaged applications
+ if (appAttempt.isPreempted()) {
+ // Premption doesn't count towards app-failures and so we should
+ // retain containers.
+ keepContainersAcrossAppAttempts = true;
+ } else if (!appAttempt.maybeLastAttempt) {
+ // Not preemption. Not last-attempt too - keep containers.
+ keepContainersAcrossAppAttempts = true;
+ }
}
appEvent =
new RMAppFailedAttemptEvent(applicationId,
@@ -1105,7 +1130,12 @@ public class RMAppAttemptImpl implements
appAttempt.getClientTokenMasterKey());
}
}
-
+
+ @Override
+ public boolean isPreempted() {
+ return getAMContainerExitStatus() == ContainerExitStatus.PREEMPTED;
+ }
+
private static final class UnmanagedAMAttemptSavedTransition
extends AMLaunchedTransition {
@Override
@@ -1208,14 +1238,22 @@ public class RMAppAttemptImpl implements
appAttempt.rmContext.getAMLivelinessMonitor().unregister(
appAttempt.getAppAttemptId());
- // Setup diagnostic message
- appAttempt.diagnostics
- .append(getAMContainerCrashedDiagnostics(finishEvent));
+ // Setup diagnostic message and exit status
+ appAttempt.setAMContainerCrashedDiagnosticsAndExitStatus(finishEvent);
+
// Tell the app, scheduler
super.transition(appAttempt, finishEvent);
}
}
+ private void setAMContainerCrashedDiagnosticsAndExitStatus(
+ RMAppAttemptContainerFinishedEvent finishEvent) {
+ ContainerStatus status = finishEvent.getContainerStatus();
+ String diagnostics = getAMContainerCrashedDiagnostics(finishEvent);
+ this.diagnostics.append(diagnostics);
+ this.amContainerExitStatus = status.getExitStatus();
+ }
+
private static String getAMContainerCrashedDiagnostics(
RMAppAttemptContainerFinishedEvent finishEvent) {
ContainerStatus status = finishEvent.getContainerStatus();
@@ -1437,13 +1475,12 @@ public class RMAppAttemptImpl implements
@Override
public void
transition(RMAppAttemptImpl appAttempt, RMAppAttemptEvent event) {
- RMAppAttemptContainerFinishedEvent containerFinishedEvent =
+ RMAppAttemptContainerFinishedEvent finishEvent =
(RMAppAttemptContainerFinishedEvent) event;
// container associated with AM. must not be unmanaged
assert appAttempt.submissionContext.getUnmanagedAM() == false;
- // Setup diagnostic message
- appAttempt.diagnostics
- .append(getAMContainerCrashedDiagnostics(containerFinishedEvent));
+ // Setup diagnostic message and exit status
+ appAttempt.setAMContainerCrashedDiagnosticsAndExitStatus(finishEvent);
new FinalTransition(RMAppAttemptState.FAILED).transition(appAttempt,
event);
}
@@ -1644,4 +1681,9 @@ public class RMAppAttemptImpl implements
}
return attemptReport;
}
+
+ // for testing
+ public boolean mayBeLastAttempt() {
+ return maybeLastAttempt;
+ }
}
Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java?rev=1605106&r1=1605105&r2=1605106&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java (original)
+++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java Tue Jun 24 15:15:12 2014
@@ -1076,14 +1076,12 @@ public class CapacityScheduler extends
@Override
public void killContainer(RMContainer cont) {
- if(LOG.isDebugEnabled()){
+ if (LOG.isDebugEnabled()) {
LOG.debug("KILL_CONTAINER: container" + cont.toString());
}
- completedContainer(cont,
- SchedulerUtils.createPreemptedContainerStatus(
- cont.getContainerId(),"Container being forcibly preempted:"
- + cont.getContainerId()),
- RMContainerEventType.KILL);
+ completedContainer(cont, SchedulerUtils.createPreemptedContainerStatus(
+ cont.getContainerId(), SchedulerUtils.PREEMPTED_CONTAINER),
+ RMContainerEventType.KILL);
}
@Override
Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java?rev=1605106&r1=1605105&r2=1605106&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java (original)
+++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java Tue Jun 24 15:15:12 2014
@@ -87,7 +87,7 @@ public class MockNM {
return httpPort;
}
- void setResourceTrackerService(ResourceTrackerService resourceTracker) {
+ public void setResourceTrackerService(ResourceTrackerService resourceTracker) {
this.resourceTracker = resourceTracker;
}
Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java?rev=1605106&r1=1605105&r2=1605106&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java (original)
+++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java Tue Jun 24 15:15:12 2014
@@ -166,6 +166,19 @@ public class MockRM extends ResourceMana
}
}
+ public MockAM waitForNewAMToLaunchAndRegister(ApplicationId appId, int attemptSize,
+ MockNM nm) throws Exception {
+ RMApp app = getRMContext().getRMApps().get(appId);
+ Assert.assertNotNull(app);
+ while (app.getAppAttempts().size() != attemptSize) {
+ System.out.println("Application " + appId
+ + " is waiting for AM to restart. Current has "
+ + app.getAppAttempts().size() + " attempts.");
+ Thread.sleep(200);
+ }
+ return launchAndRegisterAM(app, this, nm);
+ }
+
public void waitForState(MockNM nm, ContainerId containerId,
RMContainerState containerState) throws Exception {
RMContainer container = getResourceScheduler().getRMContainer(containerId);
@@ -551,6 +564,7 @@ public class MockRM extends ResourceMana
throws Exception {
rm.waitForState(app.getApplicationId(), RMAppState.ACCEPTED);
RMAppAttempt attempt = app.getCurrentAppAttempt();
+ System.out.println("Launch AM " + attempt.getAppAttemptId());
nm.nodeHeartbeat(true);
MockAM am = rm.sendAMLaunched(attempt.getAppAttemptId());
rm.waitForState(attempt.getAppAttemptId(), RMAppAttemptState.LAUNCHED);
Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/TestAMRestart.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/TestAMRestart.java?rev=1605106&r1=1605105&r2=1605106&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/TestAMRestart.java (original)
+++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/TestAMRestart.java Tue Jun 24 15:15:12 2014
@@ -22,13 +22,12 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
-import org.junit.Assert;
-
import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
import org.apache.hadoop.yarn.api.records.ApplicationAccessType;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.Container;
+import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
@@ -38,22 +37,23 @@ import org.apache.hadoop.yarn.conf.YarnC
import org.apache.hadoop.yarn.server.resourcemanager.MockAM;
import org.apache.hadoop.yarn.server.resourcemanager.MockNM;
import org.apache.hadoop.yarn.server.resourcemanager.MockRM;
+import org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore;
+import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.ApplicationState;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
+import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
+import org.junit.Assert;
import org.junit.Test;
-/**
- * Test to restart the AM on failure.
- *
- */
public class TestAMRestart {
- @Test
+ @Test(timeout = 30000)
public void testAMRestartWithExistingContainers() throws Exception {
YarnConfiguration conf = new YarnConfiguration();
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 2);
@@ -245,7 +245,7 @@ public class TestAMRestart {
}
}
- @Test
+ @Test(timeout = 30000)
public void testNMTokensRebindOnAMRestart() throws Exception {
YarnConfiguration conf = new YarnConfiguration();
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 3);
@@ -345,4 +345,122 @@ public class TestAMRestart {
Assert.assertTrue(transferredTokens.containsAll(expectedNMTokens));
rm1.stop();
}
+
+ // AM container preempted should not be counted towards AM max retry count.
+ @Test(timeout = 20000)
+ public void testAMPreemptedNotCountedForAMFailures() throws Exception {
+ YarnConfiguration conf = new YarnConfiguration();
+ conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class,
+ ResourceScheduler.class);
+ // explicitly set max-am-retry count as 1.
+ conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
+ MockRM rm1 = new MockRM(conf);
+ rm1.start();
+ MockNM nm1 =
+ new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService());
+ nm1.registerNode();
+ RMApp app1 = rm1.submitApp(200);
+ RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
+ MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+ CapacityScheduler scheduler =
+ (CapacityScheduler) rm1.getResourceScheduler();
+ ContainerId amContainer =
+ ContainerId.newInstance(am1.getApplicationAttemptId(), 1);
+ // Preempt the first attempt;
+ scheduler.killContainer(scheduler.getRMContainer(amContainer));
+
+ am1.waitForState(RMAppAttemptState.FAILED);
+ Assert.assertTrue(attempt1.isPreempted());
+ rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
+ // AM should be restarted even though max-am-attempt is 1.
+ MockAM am2 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+ RMAppAttempt attempt2 = app1.getCurrentAppAttempt();
+ Assert.assertTrue(((RMAppAttemptImpl) attempt2).mayBeLastAttempt());
+
+ // Preempt the second attempt.
+ ContainerId amContainer2 =
+ ContainerId.newInstance(am2.getApplicationAttemptId(), 1);
+ scheduler.killContainer(scheduler.getRMContainer(amContainer2));
+
+ am2.waitForState(RMAppAttemptState.FAILED);
+ Assert.assertTrue(attempt2.isPreempted());
+ rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
+ MockAM am3 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+ RMAppAttempt attempt3 = app1.getCurrentAppAttempt();
+ Assert.assertTrue(((RMAppAttemptImpl) attempt3).mayBeLastAttempt());
+
+ // fail the AM normally
+ nm1.nodeHeartbeat(am3.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
+ am3.waitForState(RMAppAttemptState.FAILED);
+ Assert.assertFalse(attempt3.isPreempted());
+
+ // AM should not be restarted.
+ rm1.waitForState(app1.getApplicationId(), RMAppState.FAILED);
+ Assert.assertEquals(3, app1.getAppAttempts().size());
+ rm1.stop();
+ }
+
+ // Test RM restarts after AM container is preempted, new RM should not count
+ // AM preemption failure towards the max-retry-account and should be able to
+ // re-launch the AM.
+ @Test(timeout = 20000)
+ public void testPreemptedAMRestartOnRMRestart() throws Exception {
+ YarnConfiguration conf = new YarnConfiguration();
+ conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class,
+ ResourceScheduler.class);
+ conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
+ conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
+ // explicitly set max-am-retry count as 1.
+ conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
+ MemoryRMStateStore memStore = new MemoryRMStateStore();
+ memStore.init(conf);
+
+ MockRM rm1 = new MockRM(conf, memStore);
+ rm1.start();
+ MockNM nm1 =
+ new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService());
+ nm1.registerNode();
+ RMApp app1 = rm1.submitApp(200);
+ RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
+ MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+ CapacityScheduler scheduler =
+ (CapacityScheduler) rm1.getResourceScheduler();
+ ContainerId amContainer =
+ ContainerId.newInstance(am1.getApplicationAttemptId(), 1);
+
+ // Forcibly preempt the am container;
+ scheduler.killContainer(scheduler.getRMContainer(amContainer));
+
+ am1.waitForState(RMAppAttemptState.FAILED);
+ Assert.assertTrue(attempt1.isPreempted());
+ rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
+
+ // state store has 1 attempt stored.
+ ApplicationState appState =
+ memStore.getState().getApplicationState().get(app1.getApplicationId());
+ Assert.assertEquals(1, appState.getAttemptCount());
+ // attempt stored has the preempted container exit status.
+ Assert.assertEquals(ContainerExitStatus.PREEMPTED,
+ appState.getAttempt(am1.getApplicationAttemptId())
+ .getAMContainerExitStatus());
+ // Restart rm.
+ MockRM rm2 = new MockRM(conf, memStore);
+ nm1.setResourceTrackerService(rm2.getResourceTrackerService());
+ nm1.registerNode();
+ rm2.start();
+
+ // Restarted RM should re-launch the am.
+ MockAM am2 =
+ rm2.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 2, nm1);
+ MockRM.finishAMAndVerifyAppState(app1, rm2, nm1, am2);
+ RMAppAttempt attempt2 =
+ rm2.getRMContext().getRMApps().get(app1.getApplicationId())
+ .getCurrentAppAttempt();
+ Assert.assertFalse(attempt2.isPreempted());
+ Assert.assertEquals(ContainerExitStatus.INVALID,
+ appState.getAttempt(am2.getApplicationAttemptId())
+ .getAMContainerExitStatus());
+ rm1.stop();
+ rm2.stop();
+ }
}
Modified: hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreTestBase.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreTestBase.java?rev=1605106&r1=1605105&r2=1605106&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreTestBase.java (original)
+++ hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreTestBase.java Tue Jun 24 15:15:12 2014
@@ -267,6 +267,7 @@ public class RMStateStoreTestBase extend
// attempt1 is loaded correctly
assertNotNull(attemptState);
assertEquals(attemptId1, attemptState.getAttemptId());
+ assertEquals(-1000, attemptState.getAMContainerExitStatus());
// attempt1 container is loaded correctly
assertEquals(containerId1, attemptState.getMasterContainer().getId());
// attempt1 applicationToken is loaded correctly
@@ -308,7 +309,7 @@ public class RMStateStoreTestBase extend
oldAttemptState.getAppAttemptCredentials(),
oldAttemptState.getStartTime(), RMAppAttemptState.FINISHED,
"myTrackingUrl", "attemptDiagnostics",
- FinalApplicationStatus.SUCCEEDED);
+ FinalApplicationStatus.SUCCEEDED, 100);
store.updateApplicationAttemptState(newAttemptState);
// test updating the state of an app/attempt whose initial state was not
@@ -331,7 +332,7 @@ public class RMStateStoreTestBase extend
oldAttemptState.getAppAttemptCredentials(),
oldAttemptState.getStartTime(), RMAppAttemptState.FINISHED,
"myTrackingUrl", "attemptDiagnostics",
- FinalApplicationStatus.SUCCEEDED);
+ FinalApplicationStatus.SUCCEEDED, 111);
store.updateApplicationAttemptState(dummyAttempt);
// let things settle down
@@ -370,6 +371,7 @@ public class RMStateStoreTestBase extend
assertEquals(RMAppAttemptState.FINISHED, updatedAttemptState.getState());
assertEquals("myTrackingUrl", updatedAttemptState.getFinalTrackingUrl());
assertEquals("attemptDiagnostics", updatedAttemptState.getDiagnostics());
+ assertEquals(100, updatedAttemptState.getAMContainerExitStatus());
assertEquals(FinalApplicationStatus.SUCCEEDED,
updatedAttemptState.getFinalApplicationStatus());