You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by zh...@apache.org on 2014/12/24 20:35:38 UTC

[24/50] hadoop git commit: YARN-2675. containersKilled metrics is not updated when the container is killed during localization. (Zhihai Xu via kasha)

YARN-2675. containersKilled metrics is not updated when the container is killed during localization. (Zhihai Xu via kasha)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/838c213b
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/838c213b
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/838c213b

Branch: refs/heads/HDFS-EC
Commit: 838c213bbec3db15e384465402d38cb2d0d7724a
Parents: 89c7bd8
Author: Karthik Kambatla <ka...@apache.org>
Authored: Fri Dec 19 16:00:24 2014 -0800
Committer: Zhe Zhang <zh...@cloudera.com>
Committed: Wed Dec 24 11:22:16 2014 -0800

----------------------------------------------------------------------
 hadoop-yarn-project/CHANGES.txt                 |   3 +
 .../container/ContainerImpl.java                | 164 +++++++++++++------
 .../nodemanager/metrics/NodeManagerMetrics.java |  16 ++
 .../container/TestContainer.java                |  27 +++
 4 files changed, 159 insertions(+), 51 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/838c213b/hadoop-yarn-project/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt
index 54ca3d2..7287a24 100644
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@@ -250,6 +250,9 @@ Release 2.7.0 - UNRELEASED
     YARN-2964. RM prematurely cancels tokens for jobs that submit jobs (oozie)
     (Jian He via jlowe)
 
+    YARN-2675. containersKilled metrics is not updated when the container is killed 
+    during localization. (Zhihai Xu via kasha)
+
 Release 2.6.0 - 2014-11-18
 
   INCOMPATIBLE CHANGES

http://git-wip-us.apache.org/repos/asf/hadoop/blob/838c213b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
index 58b1e22..f0c506d 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
@@ -159,9 +159,6 @@ public class ContainerImpl implements Container {
     this.diagnostics.append(diagnostics);
   }
 
-  private static final ContainerDoneTransition CONTAINER_DONE_TRANSITION =
-    new ContainerDoneTransition();
-
   private static final ContainerDiagnosticsUpdateTransition UPDATE_DIAGNOSTICS_TRANSITION =
       new ContainerDiagnosticsUpdateTransition();
 
@@ -202,7 +199,7 @@ public class ContainerImpl implements Container {
     .addTransition(ContainerState.LOCALIZATION_FAILED,
         ContainerState.DONE,
         ContainerEventType.CONTAINER_RESOURCES_CLEANEDUP,
-        CONTAINER_DONE_TRANSITION)
+        new LocalizationFailedToDoneTransition())
     .addTransition(ContainerState.LOCALIZATION_FAILED,
         ContainerState.LOCALIZATION_FAILED,
         ContainerEventType.UPDATE_DIAGNOSTICS_MSG,
@@ -254,7 +251,7 @@ public class ContainerImpl implements Container {
     // From CONTAINER_EXITED_WITH_SUCCESS State
     .addTransition(ContainerState.EXITED_WITH_SUCCESS, ContainerState.DONE,
         ContainerEventType.CONTAINER_RESOURCES_CLEANEDUP,
-        CONTAINER_DONE_TRANSITION)
+        new ExitedWithSuccessToDoneTransition())
     .addTransition(ContainerState.EXITED_WITH_SUCCESS,
         ContainerState.EXITED_WITH_SUCCESS,
         ContainerEventType.UPDATE_DIAGNOSTICS_MSG,
@@ -266,7 +263,7 @@ public class ContainerImpl implements Container {
     // From EXITED_WITH_FAILURE State
     .addTransition(ContainerState.EXITED_WITH_FAILURE, ContainerState.DONE,
             ContainerEventType.CONTAINER_RESOURCES_CLEANEDUP,
-            CONTAINER_DONE_TRANSITION)
+            new ExitedWithFailureToDoneTransition())
     .addTransition(ContainerState.EXITED_WITH_FAILURE,
         ContainerState.EXITED_WITH_FAILURE,
         ContainerEventType.UPDATE_DIAGNOSTICS_MSG,
@@ -301,7 +298,7 @@ public class ContainerImpl implements Container {
     .addTransition(ContainerState.KILLING,
             ContainerState.DONE,
             ContainerEventType.CONTAINER_RESOURCES_CLEANEDUP,
-            CONTAINER_DONE_TRANSITION)
+            new KillingToDoneTransition())
     // Handle a launched container during killing stage is a no-op
     // as cleanup container is always handled after launch container event
     // in the container launcher
@@ -313,7 +310,7 @@ public class ContainerImpl implements Container {
     .addTransition(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL,
             ContainerState.DONE,
             ContainerEventType.CONTAINER_RESOURCES_CLEANEDUP,
-            CONTAINER_DONE_TRANSITION)
+            new ContainerCleanedupAfterKillToDoneTransition())
     .addTransition(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL,
         ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL,
         ContainerEventType.UPDATE_DIAGNOSTICS_MSG,
@@ -463,47 +460,6 @@ public class ContainerImpl implements Container {
     }
   }
 
-  @SuppressWarnings("fallthrough")
-  private void finished() {
-    ApplicationId applicationId =
-        containerId.getApplicationAttemptId().getApplicationId();
-    switch (getContainerState()) {
-      case EXITED_WITH_SUCCESS:
-        metrics.endRunningContainer();
-        metrics.completedContainer();
-        NMAuditLogger.logSuccess(user,
-            AuditConstants.FINISH_SUCCESS_CONTAINER, "ContainerImpl",
-            applicationId, containerId);
-        break;
-      case EXITED_WITH_FAILURE:
-        if (wasLaunched) {
-          metrics.endRunningContainer();
-        }
-        // fall through
-      case LOCALIZATION_FAILED:
-        metrics.failedContainer();
-        NMAuditLogger.logFailure(user,
-            AuditConstants.FINISH_FAILED_CONTAINER, "ContainerImpl",
-            "Container failed with state: " + getContainerState(),
-            applicationId, containerId);
-        break;
-      case CONTAINER_CLEANEDUP_AFTER_KILL:
-        if (wasLaunched) {
-          metrics.endRunningContainer();
-        }
-        // fall through
-      case NEW:
-        metrics.killedContainer();
-        NMAuditLogger.logSuccess(user,
-            AuditConstants.FINISH_KILLED_CONTAINER, "ContainerImpl",
-            applicationId,
-            containerId);
-    }
-
-    metrics.releaseContainer(this.resource);
-    sendFinishedEvents();
-  }
-
   @SuppressWarnings("unchecked")
   private void sendFinishedEvents() {
     // Inform the application
@@ -611,7 +567,13 @@ public class ContainerImpl implements Container {
       } else if (container.recoveredAsKilled &&
           container.recoveredStatus == RecoveredContainerStatus.REQUESTED) {
         // container was killed but never launched
-        container.finished();
+        container.metrics.killedContainer();
+        NMAuditLogger.logSuccess(container.user,
+            AuditConstants.FINISH_KILLED_CONTAINER, "ContainerImpl",
+            container.containerId.getApplicationAttemptId().getApplicationId(),
+            container.containerId);
+        container.metrics.releaseContainer(container.resource);
+        container.sendFinishedEvents();
         return ContainerState.DONE;
       }
 
@@ -994,7 +956,8 @@ public class ContainerImpl implements Container {
     @Override
     @SuppressWarnings("unchecked")
     public void transition(ContainerImpl container, ContainerEvent event) {
-      container.finished();
+      container.metrics.releaseContainer(container.resource);
+      container.sendFinishedEvents();
       //if the current state is NEW it means the CONTAINER_INIT was never 
       // sent for the event, thus no need to send the CONTAINER_STOP
       if (container.getCurrentState() 
@@ -1016,6 +979,105 @@ public class ContainerImpl implements Container {
       container.exitCode = killEvent.getContainerExitStatus();
       container.addDiagnostics(killEvent.getDiagnostic(), "\n");
       container.addDiagnostics("Container is killed before being launched.\n");
+      container.metrics.killedContainer();
+      NMAuditLogger.logSuccess(container.user,
+          AuditConstants.FINISH_KILLED_CONTAINER, "ContainerImpl",
+          container.containerId.getApplicationAttemptId().getApplicationId(),
+          container.containerId);
+      super.transition(container, event);
+    }
+  }
+
+  /**
+   * Handle the following transition:
+   * - LOCALIZATION_FAILED -> DONE upon CONTAINER_RESOURCES_CLEANEDUP
+   */
+  static class LocalizationFailedToDoneTransition extends
+      ContainerDoneTransition {
+    @Override
+    public void transition(ContainerImpl container, ContainerEvent event) {
+      container.metrics.failedContainer();
+      NMAuditLogger.logFailure(container.user,
+          AuditConstants.FINISH_FAILED_CONTAINER, "ContainerImpl",
+          "Container failed with state: " + container.getContainerState(),
+          container.containerId.getApplicationAttemptId().getApplicationId(),
+          container.containerId);
+      super.transition(container, event);
+    }
+  }
+
+  /**
+   * Handle the following transition:
+   * - EXITED_WITH_SUCCESS -> DONE upon CONTAINER_RESOURCES_CLEANEDUP
+   */
+  static class ExitedWithSuccessToDoneTransition extends
+      ContainerDoneTransition {
+    @Override
+    public void transition(ContainerImpl container, ContainerEvent event) {
+      container.metrics.endRunningContainer();
+      container.metrics.completedContainer();
+      NMAuditLogger.logSuccess(container.user,
+          AuditConstants.FINISH_SUCCESS_CONTAINER, "ContainerImpl",
+          container.containerId.getApplicationAttemptId().getApplicationId(),
+          container.containerId);
+      super.transition(container, event);
+    }
+  }
+
+  /**
+   * Handle the following transition:
+   * - EXITED_WITH_FAILURE -> DONE upon CONTAINER_RESOURCES_CLEANEDUP
+   */
+  static class ExitedWithFailureToDoneTransition extends
+      ContainerDoneTransition {
+    @Override
+    public void transition(ContainerImpl container, ContainerEvent event) {
+      if (container.wasLaunched) {
+        container.metrics.endRunningContainer();
+      }
+      container.metrics.failedContainer();
+      NMAuditLogger.logFailure(container.user,
+          AuditConstants.FINISH_FAILED_CONTAINER, "ContainerImpl",
+          "Container failed with state: " + container.getContainerState(),
+          container.containerId.getApplicationAttemptId().getApplicationId(),
+          container.containerId);
+      super.transition(container, event);
+    }
+  }
+
+  /**
+   * Handle the following transition:
+   * - KILLING -> DONE upon CONTAINER_RESOURCES_CLEANEDUP
+   */
+  static class KillingToDoneTransition extends
+      ContainerDoneTransition {
+    @Override
+    public void transition(ContainerImpl container, ContainerEvent event) {
+      container.metrics.killedContainer();
+      NMAuditLogger.logSuccess(container.user,
+          AuditConstants.FINISH_KILLED_CONTAINER, "ContainerImpl",
+          container.containerId.getApplicationAttemptId().getApplicationId(),
+          container.containerId);
+      super.transition(container, event);
+    }
+  }
+
+  /**
+   * Handle the following transition:
+   * CONTAINER_CLEANEDUP_AFTER_KILL -> DONE upon CONTAINER_RESOURCES_CLEANEDUP
+   */
+  static class ContainerCleanedupAfterKillToDoneTransition extends
+      ContainerDoneTransition {
+    @Override
+    public void transition(ContainerImpl container, ContainerEvent event) {
+      if (container.wasLaunched) {
+        container.metrics.endRunningContainer();
+      }
+      container.metrics.killedContainer();
+      NMAuditLogger.logSuccess(container.user,
+          AuditConstants.FINISH_KILLED_CONTAINER, "ContainerImpl",
+          container.containerId.getApplicationAttemptId().getApplicationId(),
+          container.containerId);
       super.transition(container, event);
     }
   }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/838c213b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java
index beaafe1..3615fee 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java
@@ -27,6 +27,8 @@ import org.apache.hadoop.metrics2.lib.MutableRate;
 import org.apache.hadoop.metrics2.source.JvmMetrics;
 import org.apache.hadoop.yarn.api.records.Resource;
 
+import com.google.common.annotations.VisibleForTesting;
+
 @Metrics(about="Metrics for node manager", context="yarn")
 public class NodeManagerMetrics {
   @Metric MutableCounterInt containersLaunched;
@@ -127,4 +129,18 @@ public class NodeManagerMetrics {
     return containersRunning.value();
   }
 
+  @VisibleForTesting
+  public int getKilledContainers() {
+    return containersKilled.value();
+  }
+
+  @VisibleForTesting
+  public int getFailedContainers() {
+    return containersFailed.value();
+  }
+
+  @VisibleForTesting
+  public int getCompletedContainers() {
+    return containersCompleted.value();
+  }
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/838c213b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java
index c28d691..2834e30 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java
@@ -173,13 +173,20 @@ public class TestContainer {
       wc = new WrappedContainer(13, 314159265358979L, 4344, "yak");
       wc.initContainer();
       wc.localizeResources();
+      int running = metrics.getRunningContainers();
       wc.launchContainer();
+      assertEquals(running + 1, metrics.getRunningContainers());
       reset(wc.localizerBus);
       wc.containerKilledOnRequest();
       assertEquals(ContainerState.EXITED_WITH_FAILURE, 
           wc.c.getContainerState());
       assertNull(wc.c.getLocalizedResources());
       verifyCleanupCall(wc);
+      int failed = metrics.getFailedContainers();
+      wc.containerResourcesCleanup();
+      assertEquals(ContainerState.DONE, wc.c.getContainerState());
+      assertEquals(failed + 1, metrics.getFailedContainers());
+      assertEquals(running, metrics.getRunningContainers());
     }
     finally {
       if (wc != null) {
@@ -219,13 +226,20 @@ public class TestContainer {
       wc = new WrappedContainer(11, 314159265358979L, 4344, "yak");
       wc.initContainer();
       wc.localizeResources();
+      int running = metrics.getRunningContainers();
       wc.launchContainer();
+      assertEquals(running + 1, metrics.getRunningContainers());
       reset(wc.localizerBus);
       wc.containerSuccessful();
       assertEquals(ContainerState.EXITED_WITH_SUCCESS,
           wc.c.getContainerState());
       assertNull(wc.c.getLocalizedResources());
       verifyCleanupCall(wc);
+      int completed = metrics.getCompletedContainers();
+      wc.containerResourcesCleanup();
+      assertEquals(ContainerState.DONE, wc.c.getContainerState());
+      assertEquals(completed + 1, metrics.getCompletedContainers());
+      assertEquals(running, metrics.getRunningContainers());
     }
     finally {
       if (wc != null) {
@@ -319,12 +333,14 @@ public class TestContainer {
     try {
       wc = new WrappedContainer(13, 314159265358979L, 4344, "yak");
       assertEquals(ContainerState.NEW, wc.c.getContainerState());
+      int killed = metrics.getKilledContainers();
       wc.killContainer();
       assertEquals(ContainerState.DONE, wc.c.getContainerState());
       assertEquals(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER,
           wc.c.cloneAndGetContainerStatus().getExitStatus());
       assertTrue(wc.c.cloneAndGetContainerStatus().getDiagnostics()
           .contains("KillRequest"));
+      assertEquals(killed + 1, metrics.getKilledContainers());
     } finally {
       if (wc != null) {
         wc.finished();
@@ -345,6 +361,10 @@ public class TestContainer {
           wc.c.cloneAndGetContainerStatus().getExitStatus());
       assertTrue(wc.c.cloneAndGetContainerStatus().getDiagnostics()
           .contains("KillRequest"));
+      int killed = metrics.getKilledContainers();
+      wc.containerResourcesCleanup();
+      assertEquals(ContainerState.DONE, wc.c.getContainerState());
+      assertEquals(killed + 1, metrics.getKilledContainers());
     } finally {
       if (wc != null) {
         wc.finished();
@@ -365,6 +385,10 @@ public class TestContainer {
       assertEquals(ContainerState.LOCALIZATION_FAILED, wc.c.getContainerState());
       assertNull(wc.c.getLocalizedResources());
       verifyCleanupCall(wc);
+      int failed = metrics.getFailedContainers();
+      wc.containerResourcesCleanup();
+      assertEquals(ContainerState.DONE, wc.c.getContainerState());
+      assertEquals(failed + 1, metrics.getFailedContainers());
     } finally {
       if (wc != null) {
         wc.finished();
@@ -389,8 +413,11 @@ public class TestContainer {
           wc.c.getContainerState());
       assertNull(wc.c.getLocalizedResources());
       verifyCleanupCall(wc);
+      int killed = metrics.getKilledContainers();
       wc.c.handle(new ContainerEvent(wc.c.getContainerId(),
           ContainerEventType.CONTAINER_RESOURCES_CLEANEDUP));
+      assertEquals(ContainerState.DONE, wc.c.getContainerState());
+      assertEquals(killed + 1, metrics.getKilledContainers());
       assertEquals(0, metrics.getRunningContainers());
     } finally {
       if (wc != null) {