You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by ha...@apache.org on 2018/05/30 21:02:17 UTC

[50/50] [abbrv] hadoop git commit: YARN-8362. Bugfix logic in container retries in node manager. Contributed by Chandni Singh

YARN-8362.  Bugfix logic in container retries in node manager.
            Contributed by Chandni Singh


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/c6bdbf17
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/c6bdbf17
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/c6bdbf17

Branch: refs/heads/HDDS-48
Commit: c6bdbf176c1856f578bbc2cc38a51cef5307e423
Parents: 7c64d5d
Author: Eric Yang <ey...@apache.org>
Authored: Tue May 29 16:56:58 2018 -0400
Committer: Hanisha Koneru <ha...@apache.org>
Committed: Wed May 30 14:00:26 2018 -0700

----------------------------------------------------------------------
 .../container/ContainerImpl.java                |  4 +-
 .../container/SlidingWindowRetryPolicy.java     | 62 +++++++++++---------
 .../container/TestSlidingWindowRetryPolicy.java |  6 ++
 3 files changed, 44 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/c6bdbf17/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
index c09c7f1..5527ac4 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
@@ -1602,8 +1602,10 @@ public class ContainerImpl implements Container {
         }
         container.addDiagnostics(exitEvent.getDiagnosticInfo() + "\n");
       }
-
       if (container.shouldRetry(container.exitCode)) {
+        // Updates to the retry context should  be protected from concurrent
+        // writes. It should only be called from this transition.
+        container.retryPolicy.updateRetryContext(container.windowRetryContext);
         container.storeRetryContext();
         doRelaunch(container,
             container.windowRetryContext.getRemainingRetries(),

http://git-wip-us.apache.org/repos/asf/hadoop/blob/c6bdbf17/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java
index 0208879..36a8b91 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java
@@ -42,49 +42,40 @@ public class SlidingWindowRetryPolicy {
 
   public boolean shouldRetry(RetryContext retryContext,
       int errorCode) {
-    ContainerRetryContext containerRC = retryContext
-        .containerRetryContext;
+    ContainerRetryContext containerRC = retryContext.containerRetryContext;
     Preconditions.checkNotNull(containerRC, "container retry context null");
     ContainerRetryPolicy retryPolicy = containerRC.getRetryPolicy();
     if (retryPolicy == ContainerRetryPolicy.RETRY_ON_ALL_ERRORS
         || (retryPolicy == ContainerRetryPolicy.RETRY_ON_SPECIFIC_ERROR_CODES
         && containerRC.getErrorCodes() != null
         && containerRC.getErrorCodes().contains(errorCode))) {
-      if (containerRC.getMaxRetries() == ContainerRetryContext.RETRY_FOREVER) {
-        return true;
-      }
-      int pendingRetries = calculatePendingRetries(retryContext);
-      updateRetryContext(retryContext, pendingRetries);
-      return pendingRetries > 0;
+      return containerRC.getMaxRetries() == ContainerRetryContext.RETRY_FOREVER
+          || calculateRemainingRetries(retryContext) > 0;
     }
     return false;
   }
 
   /**
-   * Calculates the pending number of retries.
-   * <p>
-   * When failuresValidityInterval is > 0, it also removes time entries from
-   * <code>restartTimes</code> which are outside the validity interval.
+   * Calculates the remaining number of retries.
    *
-   * @return the pending retries.
+   * @return the remaining retries.
    */
-  private int calculatePendingRetries(RetryContext retryContext) {
+  private int calculateRemainingRetries(RetryContext retryContext) {
     ContainerRetryContext containerRC =
         retryContext.containerRetryContext;
     if (containerRC.getFailuresValidityInterval() > 0) {
-      Iterator<Long> iterator = retryContext.getRestartTimes().iterator();
+      int validFailuresCount = 0;
       long currentTime = clock.getTime();
-      while (iterator.hasNext()) {
-        long restartTime = iterator.next();
+      for (int i = retryContext.restartTimes.size() - 1; i >= 0; i--) {
+        long restartTime = retryContext.restartTimes.get(i);
         if (currentTime - restartTime
-            > containerRC.getFailuresValidityInterval()) {
-          iterator.remove();
+            <= containerRC.getFailuresValidityInterval()) {
+          validFailuresCount++;
         } else {
           break;
         }
       }
-      return containerRC.getMaxRetries() -
-          retryContext.getRestartTimes().size();
+      return containerRC.getMaxRetries() - validFailuresCount;
     } else {
       return retryContext.getRemainingRetries();
     }
@@ -93,13 +84,30 @@ public class SlidingWindowRetryPolicy {
   /**
    * Updates remaining retries and the restart time when
    * required in the retryContext.
+   * <p>
+   * When failuresValidityInterval is > 0, it also removes time entries from
+   * <code>restartTimes</code> which are outside the validity interval.
    */
-  private void updateRetryContext(RetryContext retryContext,
-      int pendingRetries) {
-    retryContext.setRemainingRetries(pendingRetries - 1);
-    if (retryContext.containerRetryContext.getFailuresValidityInterval()
-        > 0) {
-      retryContext.getRestartTimes().add(clock.getTime());
+  protected void updateRetryContext(RetryContext retryContext) {
+    if (retryContext.containerRetryContext.getFailuresValidityInterval() > 0) {
+      ContainerRetryContext containerRC = retryContext.containerRetryContext;
+      Iterator<Long> iterator = retryContext.getRestartTimes().iterator();
+      long currentTime = clock.getTime();
+
+      while (iterator.hasNext()) {
+        long restartTime = iterator.next();
+        if (currentTime - restartTime
+            > containerRC.getFailuresValidityInterval()) {
+          iterator.remove();
+        } else {
+          break;
+        }
+      }
+      retryContext.setRemainingRetries(containerRC.getMaxRetries() -
+          retryContext.restartTimes.size());
+      retryContext.getRestartTimes().add(currentTime);
+    } else {
+      retryContext.remainingRetries--;
     }
   }
 

http://git-wip-us.apache.org/repos/asf/hadoop/blob/c6bdbf17/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java
index 04889a9..bacf3bb 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java
@@ -64,12 +64,18 @@ public class TestSlidingWindowRetryPolicy {
         new SlidingWindowRetryPolicy.RetryContext(retryContext);
     Assert.assertTrue("retry 1",
         retryPolicy.shouldRetry(windowRetryContext, 12));
+    retryPolicy.updateRetryContext(windowRetryContext);
+
     clock.setTime(20);
     Assert.assertTrue("retry 2",
         retryPolicy.shouldRetry(windowRetryContext, 12));
+    retryPolicy.updateRetryContext(windowRetryContext);
+
     clock.setTime(40);
     Assert.assertTrue("retry 3",
         retryPolicy.shouldRetry(windowRetryContext, 12));
+    retryPolicy.updateRetryContext(windowRetryContext);
+
     clock.setTime(45);
     Assert.assertFalse("retry failed",
         retryPolicy.shouldRetry(windowRetryContext, 12));


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org