You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by jl...@apache.org on 2014/11/14 22:28:54 UTC
hadoop git commit: YARN-2816. NM fails to start with NPE during
container recovery. Contributed by Zhihai Xu (cherry picked from commit
49c38898b0be64fc686d039ed2fb2dea1378df02)
Repository: hadoop
Updated Branches:
refs/heads/branch-2 5b90e428a -> ad140d1fc
YARN-2816. NM fails to start with NPE during container recovery. Contributed by Zhihai Xu
(cherry picked from commit 49c38898b0be64fc686d039ed2fb2dea1378df02)
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/ad140d1f
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/ad140d1f
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/ad140d1f
Branch: refs/heads/branch-2
Commit: ad140d1fc831735fb9335e27b38d2fc040847af1
Parents: 5b90e42
Author: Jason Lowe <jl...@apache.org>
Authored: Fri Nov 14 21:25:59 2014 +0000
Committer: Jason Lowe <jl...@apache.org>
Committed: Fri Nov 14 21:27:16 2014 +0000
----------------------------------------------------------------------
hadoop-yarn-project/CHANGES.txt | 3 +++
.../recovery/NMLeveldbStateStoreService.java | 24 +++++++++++++++++++-
.../TestNMLeveldbStateStoreService.java | 7 ++++++
3 files changed, 33 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hadoop/blob/ad140d1f/hadoop-yarn-project/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt
index 2d83526..0f3d31e 100644
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@@ -63,6 +63,9 @@ Release 2.7.0 - UNRELEASED
YARN-2857. ConcurrentModificationException in ContainerLogAppender
(Mohammad Kamrul Islam via jlowe)
+ YARN-2816. NM fail to start with NPE during container recovery (Zhihai Xu
+ via jlowe)
+
Release 2.6.0 - 2014-11-18
INCOMPATIBLE CHANGES
http://git-wip-us.apache.org/repos/asf/hadoop/blob/ad140d1f/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
index 7cf4921..9d54688 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
@@ -146,6 +146,8 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
throws IOException {
ArrayList<RecoveredContainerState> containers =
new ArrayList<RecoveredContainerState>();
+ ArrayList<ContainerId> containersToRemove =
+ new ArrayList<ContainerId>();
LeveldbIterator iter = null;
try {
iter = new LeveldbIterator(db);
@@ -165,7 +167,14 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
ContainerId containerId = ConverterUtils.toContainerId(
key.substring(CONTAINERS_KEY_PREFIX.length(), idEndPos));
String keyPrefix = key.substring(0, idEndPos+1);
- containers.add(loadContainerState(containerId, iter, keyPrefix));
+ RecoveredContainerState rcs = loadContainerState(containerId,
+ iter, keyPrefix);
+ // Don't load container without StartContainerRequest
+ if (rcs.startRequest != null) {
+ containers.add(rcs);
+ } else {
+ containersToRemove.add(containerId);
+ }
}
} catch (DBException e) {
throw new IOException(e);
@@ -175,6 +184,19 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
}
}
+ // remove container without StartContainerRequest
+ for (ContainerId containerId : containersToRemove) {
+ LOG.warn("Remove container " + containerId +
+ " with incomplete records");
+ try {
+ removeContainer(containerId);
+ // TODO: kill and cleanup the leaked container
+ } catch (IOException e) {
+ LOG.error("Unable to remove container " + containerId +
+ " in store", e);
+ }
+ }
+
return containers;
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/ad140d1f/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
index 438cec3..f7f43cc 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
@@ -274,6 +274,13 @@ public class TestNMLeveldbStateStoreService {
assertEquals(containerReq, rcs.getStartRequest());
assertTrue(rcs.getDiagnostics().isEmpty());
+ // store a new container record without StartContainerRequest
+ ContainerId containerId1 = ContainerId.newContainerId(appAttemptId, 6);
+ stateStore.storeContainerLaunched(containerId1);
+ recoveredContainers = stateStore.loadContainersState();
+ // check whether the new container record is discarded
+ assertEquals(1, recoveredContainers.size());
+
// launch the container, add some diagnostics, and verify recovered
StringBuilder diags = new StringBuilder();
stateStore.storeContainerLaunched(containerId);