You are viewing a plain text version of this content. The canonical link for it is here.
Posted to yarn-issues@hadoop.apache.org by "Jian He (JIRA)" <ji...@apache.org> on 2015/09/17 12:04:45 UTC
[jira] [Comment Edited] (YARN-4000) RM crashes with NPE if leaf
queue becomes parent queue during restart
[ https://issues.apache.org/jira/browse/YARN-4000?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14802698#comment-14802698 ]
Jian He edited comment on YARN-4000 at 9/17/15 10:04 AM:
---------------------------------------------------------
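- Is this if condition a typo? With the outer isEmpty() check, the ternary below always evaluates to getAppKilledDiagnostics(), so a non-empty diagnostic message from the event is never stored.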
{code}
if (event.getDiagnosticMsg().isEmpty())
app.appDiagnosticsBeforeKilling =
event.getDiagnosticMsg().isEmpty() ? getAppKilledDiagnostics() : event.getDiagnosticMsg();
{code}
Instead of introducing the appDiagnosticsBeforeKilling field in RMAppImpl, I suggest making the changes below in RMAppImpl and RMAppAttemptImpl. The idea is to send the diagnostics from the app to the attempt and let the attempt send them back.
{code}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
index ea9aa70..dc46326 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
@@ -1112,7 +1112,7 @@ private void rememberTargetTransitionsAndStoreState(RMAppEvent event,
diags = getAppAttemptFailedDiagnostics(failedEvent);
break;
case ATTEMPT_KILLED:
- diags = getAppKilledDiagnostics();
+ diags = event.getDiagnostics();
break;
default:
break;
@@ -1209,21 +1209,17 @@ public AppKilledTransition() {
@Override
public void transition(RMAppImpl app, RMAppEvent event) {
- app.diagnostics.append(getAppKilledDiagnostics());
+ app.diagnostics.append(event.getDiagnostics());
super.transition(app, event);
};
}
- private static String getAppKilledDiagnostics() {
- return "Application killed by user.";
- }
-
private static class KillAttemptTransition extends RMAppTransition {
@Override
public void transition(RMAppImpl app, RMAppEvent event) {
app.stateBeforeKilling = app.getState();
app.handler.handle(new RMAppAttemptEvent(app.currentAttempt
- .getAppAttemptId(), RMAppAttemptEventType.KILL));
+ .getAppAttemptId(), RMAppAttemptEventType.KILL, event.getDiagnostics()));
}
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java
index 629b2a3..d4f254e 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java
@@ -1270,8 +1270,7 @@ public void transition(RMAppAttemptImpl appAttempt,
appAttempt.invalidateAMHostAndPort();
appEvent =
new RMAppFailedAttemptEvent(applicationId,
- RMAppEventType.ATTEMPT_KILLED,
- "Application killed by user.", false);
+ RMAppEventType.ATTEMPT_KILLED, event.getDiagnostics(), false);
}
break;
case FAILED:
{code}
- An arbitrary fixed sleep may make the test flaky; use {{MockRM#waitForState(ApplicationId appId, RMAppState finalState)}} instead.
{code}
// Wait for app and attempt to be killed.
Thread.sleep(1000);
{code}
was (Author: jianhe):
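- Is this if condition a typo? With the outer isEmpty() check, the ternary below always evaluates to getAppKilledDiagnostics(), so a non-empty diagnostic message from the event is never stored.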
{code}
if (event.getDiagnosticMsg().isEmpty())
app.appDiagnosticsBeforeKilling =
event.getDiagnosticMsg().isEmpty() ? getAppKilledDiagnostics() : event.getDiagnosticMsg();
{code}
Instead of introducing the appDiagnosticsBeforeKilling field in RMAppImpl, I suggest making the changes below in RMAppImpl and RMAppAttemptImpl.
{code}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
index ea9aa70..dc46326 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
@@ -1112,7 +1112,7 @@ private void rememberTargetTransitionsAndStoreState(RMAppEvent event,
diags = getAppAttemptFailedDiagnostics(failedEvent);
break;
case ATTEMPT_KILLED:
- diags = getAppKilledDiagnostics();
+ diags = event.getDiagnostics();
break;
default:
break;
@@ -1209,21 +1209,17 @@ public AppKilledTransition() {
@Override
public void transition(RMAppImpl app, RMAppEvent event) {
- app.diagnostics.append(getAppKilledDiagnostics());
+ app.diagnostics.append(event.getDiagnostics());
super.transition(app, event);
};
}
- private static String getAppKilledDiagnostics() {
- return "Application killed by user.";
- }
-
private static class KillAttemptTransition extends RMAppTransition {
@Override
public void transition(RMAppImpl app, RMAppEvent event) {
app.stateBeforeKilling = app.getState();
app.handler.handle(new RMAppAttemptEvent(app.currentAttempt
- .getAppAttemptId(), RMAppAttemptEventType.KILL));
+ .getAppAttemptId(), RMAppAttemptEventType.KILL, event.getDiagnostics()));
}
}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java
index 629b2a3..d4f254e 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java
@@ -1270,8 +1270,7 @@ public void transition(RMAppAttemptImpl appAttempt,
appAttempt.invalidateAMHostAndPort();
appEvent =
new RMAppFailedAttemptEvent(applicationId,
- RMAppEventType.ATTEMPT_KILLED,
- "Application killed by user.", false);
+ RMAppEventType.ATTEMPT_KILLED, event.getDiagnostics(), false);
}
break;
case FAILED:
{code}
- An arbitrary fixed sleep may make the test flaky; use {{MockRM#waitForState(ApplicationId appId, RMAppState finalState)}} instead.
{code}
// Wait for app and attempt to be killed.
Thread.sleep(1000);
{code}
> RM crashes with NPE if leaf queue becomes parent queue during restart
> ---------------------------------------------------------------------
>
> Key: YARN-4000
> URL: https://issues.apache.org/jira/browse/YARN-4000
> Project: Hadoop YARN
> Issue Type: Bug
> Components: capacityscheduler, resourcemanager
> Affects Versions: 2.6.0
> Reporter: Jason Lowe
> Assignee: Varun Saxena
> Attachments: YARN-4000.01.patch, YARN-4000.02.patch, YARN-4000.03.patch
>
>
> This is a similar situation to YARN-2308. If an application is active in queue A and then the RM restarts with a changed capacity scheduler configuration where queue A becomes a parent queue to other subqueues then the RM will crash with a NullPointerException.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)