You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by jl...@apache.org on 2017/08/25 14:20:10 UTC
hadoop git commit: YARN-6640. AM heartbeat stuck when responseId
overflows MAX_INT. Contributed by Botong Huang
Repository: hadoop
Updated Branches:
refs/heads/trunk 9e2699ac2 -> 3a4e86116
YARN-6640. AM heartbeat stuck when responseId overflows MAX_INT. Contributed by Botong Huang
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/3a4e8611
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/3a4e8611
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/3a4e8611
Branch: refs/heads/trunk
Commit: 3a4e861169dc3da9df0158ba6f44a9bc8576e217
Parents: 9e2699a
Author: Jason Lowe <jl...@apache.org>
Authored: Fri Aug 25 09:16:17 2017 -0500
Committer: Jason Lowe <jl...@apache.org>
Committed: Fri Aug 25 09:16:17 2017 -0500
----------------------------------------------------------------------
.../ApplicationMasterService.java | 33 +++++++++++++----
.../yarn/server/resourcemanager/MockAM.java | 31 +++++++++++-----
.../TestApplicationMasterService.java | 37 +++++++++++++++++++-
3 files changed, 86 insertions(+), 15 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hadoop/blob/3a4e8611/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java
index f77d09e..0b13887 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java
@@ -81,6 +81,8 @@ import com.google.common.annotations.VisibleForTesting;
public class ApplicationMasterService extends AbstractService implements
ApplicationMasterProtocol {
private static final Log LOG = LogFactory.getLog(ApplicationMasterService.class);
+ private static final int PRE_REGISTER_RESPONSE_ID = -1;
+
private final AMLivelinessMonitor amLivelinessMonitor;
private YarnScheduler rScheduler;
protected InetSocketAddress masterServiceAddress;
@@ -325,6 +327,11 @@ public class ApplicationMasterService extends AbstractService implements
protected static final Allocation EMPTY_ALLOCATION = new Allocation(
EMPTY_CONTAINER_LIST, Resources.createResource(0), null, null, null);
+ private int getNextResponseId(int responseId) {
+ // Loop between 0 to Integer.MAX_VALUE
+ return (responseId + 1) & Integer.MAX_VALUE;
+ }
+
@Override
public AllocateResponse allocate(AllocateRequest request)
throws YarnException, IOException {
@@ -357,14 +364,17 @@ public class ApplicationMasterService extends AbstractService implements
throw new ApplicationMasterNotRegisteredException(message);
}
- if ((request.getResponseId() + 1) == lastResponse.getResponseId()) {
- /* old heartbeat */
+ // Normally request.getResponseId() == lastResponse.getResponseId()
+ if (getNextResponseId(request.getResponseId()) == lastResponse
+ .getResponseId()) {
+ // heartbeat one step old, simply return lastReponse
return lastResponse;
- } else if (request.getResponseId() + 1 < lastResponse.getResponseId()) {
+ } else if (request.getResponseId() != lastResponse.getResponseId()) {
String message =
"Invalid responseId in AllocateRequest from application attempt: "
+ appAttemptId + ", expect responseId to be "
- + (lastResponse.getResponseId() + 1);
+ + lastResponse.getResponseId() + ", but get "
+ + request.getResponseId();
throw new InvalidApplicationMasterRequestException(message);
}
@@ -404,7 +414,7 @@ public class ApplicationMasterService extends AbstractService implements
* need to worry about unregister call occurring in between (which
* removes the lock object).
*/
- response.setResponseId(lastResponse.getResponseId() + 1);
+ response.setResponseId(getNextResponseId(lastResponse.getResponseId()));
lock.setAllocateResponse(response);
return response;
}
@@ -415,12 +425,23 @@ public class ApplicationMasterService extends AbstractService implements
recordFactory.newRecordInstance(AllocateResponse.class);
// set response id to -1 before application master for the following
// attemptID get registered
- response.setResponseId(-1);
+ response.setResponseId(PRE_REGISTER_RESPONSE_ID);
LOG.info("Registering app attempt : " + attemptId);
responseMap.put(attemptId, new AllocateResponseLock(response));
rmContext.getNMTokenSecretManager().registerApplicationAttempt(attemptId);
}
+ @VisibleForTesting
+ protected boolean setAttemptLastResponseId(ApplicationAttemptId attemptId,
+ int lastResponseId) {
+ AllocateResponseLock lock = responseMap.get(attemptId);
+ if (lock == null || lock.getAllocateResponse() == null) {
+ return false;
+ }
+ lock.getAllocateResponse().setResponseId(lastResponseId);
+ return true;
+ }
+
public void unregisterAttempt(ApplicationAttemptId attemptId) {
LOG.info("Unregistering app attempt : " + attemptId);
responseMap.remove(attemptId);
http://git-wip-us.apache.org/repos/asf/hadoop/blob/3a4e8611/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockAM.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockAM.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockAM.java
index 2451c1e..a698ecf 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockAM.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockAM.java
@@ -126,6 +126,18 @@ public class MockAM {
}
}
+ public boolean setApplicationLastResponseId(int newLastResponseId) {
+ ApplicationMasterService applicationMasterService =
+ (ApplicationMasterService) amRMProtocol;
+ responseId = newLastResponseId;
+ return applicationMasterService.setAttemptLastResponseId(attemptId,
+ newLastResponseId);
+ }
+
+ public int getResponseId() {
+ return responseId;
+ }
+
public void addRequests(String[] hosts, int memory, int priority,
int containers) throws Exception {
addRequests(hosts, memory, priority, containers, 0L);
@@ -272,19 +284,22 @@ public class MockAM {
public AllocateResponse doAllocateAs(UserGroupInformation ugi,
final AllocateRequest req) throws Exception {
- req.setResponseId(++responseId);
+ req.setResponseId(responseId);
try {
- return ugi.doAs(new PrivilegedExceptionAction<AllocateResponse>() {
- @Override
- public AllocateResponse run() throws Exception {
- return amRMProtocol.allocate(req);
- }
- });
+ AllocateResponse response =
+ ugi.doAs(new PrivilegedExceptionAction<AllocateResponse>() {
+ @Override
+ public AllocateResponse run() throws Exception {
+ return amRMProtocol.allocate(req);
+ }
+ });
+ responseId = response.getResponseId();
+ return response;
} catch (UndeclaredThrowableException e) {
throw (Exception) e.getCause();
}
}
-
+
public AllocateResponse doHeartbeat() throws Exception {
return allocate(null, null);
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/3a4e8611/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestApplicationMasterService.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestApplicationMasterService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestApplicationMasterService.java
index 85a36e7..e684f3c 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestApplicationMasterService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestApplicationMasterService.java
@@ -272,7 +272,42 @@ public class TestApplicationMasterService {
Assert.assertEquals(MockRM.getClusterTimeStamp(), tokenId.getRMIdentifier());
rm.stop();
}
-
+
+ @Test(timeout = 3000000)
+ public void testAllocateResponseIdOverflow() throws Exception {
+ MockRM rm = new MockRM(conf);
+ try {
+ rm.start();
+
+ // Register node1
+ MockNM nm1 = rm.registerNode("127.0.0.1:1234", 6 * GB);
+
+ // Submit an application
+ RMApp app1 = rm.submitApp(2048);
+
+ // kick the scheduling
+ nm1.nodeHeartbeat(true);
+ RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
+ MockAM am1 = rm.sendAMLaunched(attempt1.getAppAttemptId());
+ am1.registerAppAttempt();
+
+ // Set the last reponseId to be MAX_INT
+ Assert.assertTrue(am1.setApplicationLastResponseId(Integer.MAX_VALUE));
+
+ // Both allocate should succeed
+ am1.schedule(); // send allocate with reponseId = MAX_INT
+ Assert.assertEquals(0, am1.getResponseId());
+
+ am1.schedule(); // send allocate with reponseId = 0
+ Assert.assertEquals(1, am1.getResponseId());
+
+ } finally {
+ if (rm != null) {
+ rm.stop();
+ }
+ }
+ }
+
@Test(timeout=600000)
public void testInvalidContainerReleaseRequest() throws Exception {
MockRM rm = new MockRM(conf);
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org