You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by ha...@apache.org on 2018/05/30 21:12:30 UTC
[40/50] [abbrv] hadoop git commit: YARN-8362. Bugfix logic in
container retries in node manager. Contributed by Chandni Singh
YARN-8362. Bugfix logic in container retries in node manager.
Contributed by Chandni Singh
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/135941e0
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/135941e0
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/135941e0
Branch: refs/heads/HDDS-48
Commit: 135941e00d762a417c3b4cc524cdc59b0d1810b1
Parents: 2416906
Author: Eric Yang <ey...@apache.org>
Authored: Tue May 29 16:56:58 2018 -0400
Committer: Eric Yang <ey...@apache.org>
Committed: Tue May 29 16:56:58 2018 -0400
----------------------------------------------------------------------
.../container/ContainerImpl.java | 4 +-
.../container/SlidingWindowRetryPolicy.java | 62 +++++++++++---------
.../container/TestSlidingWindowRetryPolicy.java | 6 ++
3 files changed, 44 insertions(+), 28 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hadoop/blob/135941e0/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
index c09c7f1..5527ac4 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
@@ -1602,8 +1602,10 @@ public class ContainerImpl implements Container {
}
container.addDiagnostics(exitEvent.getDiagnosticInfo() + "\n");
}
-
if (container.shouldRetry(container.exitCode)) {
+ // Updates to the retry context should be protected from concurrent
+ // writes. It should only be called from this transition.
+ container.retryPolicy.updateRetryContext(container.windowRetryContext);
container.storeRetryContext();
doRelaunch(container,
container.windowRetryContext.getRemainingRetries(),
http://git-wip-us.apache.org/repos/asf/hadoop/blob/135941e0/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java
index 0208879..36a8b91 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java
@@ -42,49 +42,40 @@ public class SlidingWindowRetryPolicy {
public boolean shouldRetry(RetryContext retryContext,
int errorCode) {
- ContainerRetryContext containerRC = retryContext
- .containerRetryContext;
+ ContainerRetryContext containerRC = retryContext.containerRetryContext;
Preconditions.checkNotNull(containerRC, "container retry context null");
ContainerRetryPolicy retryPolicy = containerRC.getRetryPolicy();
if (retryPolicy == ContainerRetryPolicy.RETRY_ON_ALL_ERRORS
|| (retryPolicy == ContainerRetryPolicy.RETRY_ON_SPECIFIC_ERROR_CODES
&& containerRC.getErrorCodes() != null
&& containerRC.getErrorCodes().contains(errorCode))) {
- if (containerRC.getMaxRetries() == ContainerRetryContext.RETRY_FOREVER) {
- return true;
- }
- int pendingRetries = calculatePendingRetries(retryContext);
- updateRetryContext(retryContext, pendingRetries);
- return pendingRetries > 0;
+ return containerRC.getMaxRetries() == ContainerRetryContext.RETRY_FOREVER
+ || calculateRemainingRetries(retryContext) > 0;
}
return false;
}
/**
- * Calculates the pending number of retries.
- * <p>
- * When failuresValidityInterval is > 0, it also removes time entries from
- * <code>restartTimes</code> which are outside the validity interval.
+ * Calculates the remaining number of retries.
*
- * @return the pending retries.
+ * @return the remaining retries.
*/
- private int calculatePendingRetries(RetryContext retryContext) {
+ private int calculateRemainingRetries(RetryContext retryContext) {
ContainerRetryContext containerRC =
retryContext.containerRetryContext;
if (containerRC.getFailuresValidityInterval() > 0) {
- Iterator<Long> iterator = retryContext.getRestartTimes().iterator();
+ int validFailuresCount = 0;
long currentTime = clock.getTime();
- while (iterator.hasNext()) {
- long restartTime = iterator.next();
+ for (int i = retryContext.restartTimes.size() - 1; i >= 0; i--) {
+ long restartTime = retryContext.restartTimes.get(i);
if (currentTime - restartTime
- > containerRC.getFailuresValidityInterval()) {
- iterator.remove();
+ <= containerRC.getFailuresValidityInterval()) {
+ validFailuresCount++;
} else {
break;
}
}
- return containerRC.getMaxRetries() -
- retryContext.getRestartTimes().size();
+ return containerRC.getMaxRetries() - validFailuresCount;
} else {
return retryContext.getRemainingRetries();
}
@@ -93,13 +84,30 @@ public class SlidingWindowRetryPolicy {
/**
* Updates remaining retries and the restart time when
* required in the retryContext.
+ * <p>
+ * When failuresValidityInterval is > 0, it also removes time entries from
+ * <code>restartTimes</code> which are outside the validity interval.
*/
- private void updateRetryContext(RetryContext retryContext,
- int pendingRetries) {
- retryContext.setRemainingRetries(pendingRetries - 1);
- if (retryContext.containerRetryContext.getFailuresValidityInterval()
- > 0) {
- retryContext.getRestartTimes().add(clock.getTime());
+ protected void updateRetryContext(RetryContext retryContext) {
+ if (retryContext.containerRetryContext.getFailuresValidityInterval() > 0) {
+ ContainerRetryContext containerRC = retryContext.containerRetryContext;
+ Iterator<Long> iterator = retryContext.getRestartTimes().iterator();
+ long currentTime = clock.getTime();
+
+ while (iterator.hasNext()) {
+ long restartTime = iterator.next();
+ if (currentTime - restartTime
+ > containerRC.getFailuresValidityInterval()) {
+ iterator.remove();
+ } else {
+ break;
+ }
+ }
+ retryContext.setRemainingRetries(containerRC.getMaxRetries() -
+ retryContext.restartTimes.size());
+ retryContext.getRestartTimes().add(currentTime);
+ } else {
+ retryContext.remainingRetries--;
}
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/135941e0/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java
index 04889a9..bacf3bb 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java
@@ -64,12 +64,18 @@ public class TestSlidingWindowRetryPolicy {
new SlidingWindowRetryPolicy.RetryContext(retryContext);
Assert.assertTrue("retry 1",
retryPolicy.shouldRetry(windowRetryContext, 12));
+ retryPolicy.updateRetryContext(windowRetryContext);
+
clock.setTime(20);
Assert.assertTrue("retry 2",
retryPolicy.shouldRetry(windowRetryContext, 12));
+ retryPolicy.updateRetryContext(windowRetryContext);
+
clock.setTime(40);
Assert.assertTrue("retry 3",
retryPolicy.shouldRetry(windowRetryContext, 12));
+ retryPolicy.updateRetryContext(windowRetryContext);
+
clock.setTime(45);
Assert.assertFalse("retry failed",
retryPolicy.shouldRetry(windowRetryContext, 12));
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org