You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by bi...@apache.org on 2018/05/08 20:50:38 UTC

hadoop git commit: YARN-7894. Improve ATS response for DShell DS_CONTAINER when container launch fails. Contributed by Chandni Singh

Repository: hadoop
Updated Branches:
  refs/heads/trunk a2ea75642 -> 1ef0a1db1


YARN-7894. Improve ATS response for DShell DS_CONTAINER when container launch fails. Contributed by Chandni Singh


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/1ef0a1db
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/1ef0a1db
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/1ef0a1db

Branch: refs/heads/trunk
Commit: 1ef0a1db1d6a412b2a26782329a8325635866d0a
Parents: a2ea756
Author: Billie Rinaldi <bi...@apache.org>
Authored: Tue May 8 13:49:41 2018 -0700
Committer: Billie Rinaldi <bi...@apache.org>
Committed: Tue May 8 13:49:41 2018 -0700

----------------------------------------------------------------------
 .../distributedshell/ApplicationMaster.java     | 98 +++++++++++++++++---
 .../distributedshell/TestDistributedShell.java  | 10 ++
 2 files changed, 93 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/1ef0a1db/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/ApplicationMaster.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/ApplicationMaster.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/ApplicationMaster.java
index cca5676..bc018b1 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/ApplicationMaster.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/ApplicationMaster.java
@@ -117,6 +117,7 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
 import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
+import org.apache.hadoop.yarn.util.BoundedAppender;
 import org.apache.hadoop.yarn.util.SystemClock;
 import org.apache.hadoop.yarn.util.TimelineServiceHelper;
 import org.apache.hadoop.yarn.util.resource.ResourceUtils;
@@ -345,6 +346,7 @@ public class ApplicationMaster {
   static final String CONTAINER_ENTITY_GROUP_ID = "CONTAINERS";
   static final String APPID_TIMELINE_FILTER_NAME = "appId";
   static final String USER_TIMELINE_FILTER_NAME = "user";
+  static final String DIAGNOSTICS = "Diagnostics";
 
   private final String linux_bash_command = "bash";
   private final String windows_command = "cmd /c";
@@ -356,6 +358,8 @@ public class ApplicationMaster {
   protected final Set<ContainerId> launchedContainers =
       Collections.newSetFromMap(new ConcurrentHashMap<ContainerId, Boolean>());
 
+  private BoundedAppender diagnostics = new BoundedAppender(64 * 1024);
+
   /**
    * Container start times used to set id prefix while publishing entity
    * to ATSv2.
@@ -390,7 +394,7 @@ public class ApplicationMaster {
       LOG.info("Application Master completed successfully. exiting");
       System.exit(0);
     } else {
-      LOG.info("Application Master failed. exiting");
+      LOG.error("Application Master failed. exiting");
       System.exit(2);
     }
   }
@@ -931,28 +935,25 @@ public class ApplicationMaster {
     LOG.info("Application completed. Signalling finish to RM");
 
     FinalApplicationStatus appStatus;
-    String appMessage = null;
     boolean success = true;
+    String message = null;
     if (numCompletedContainers.get() - numFailedContainers.get()
         >= numTotalContainers) {
       appStatus = FinalApplicationStatus.SUCCEEDED;
     } else {
       appStatus = FinalApplicationStatus.FAILED;
-      appMessage = "Diagnostics." + ", total=" + numTotalContainers
-          + ", completed=" + numCompletedContainers.get() + ", allocated="
-          + numAllocatedContainers.get() + ", failed="
-          + numFailedContainers.get();
-      LOG.info(appMessage);
+      message = String.format("Application Failure: desired = %d, " +
+              "completed = %d, allocated = %d, failed = %d, " +
+              "diagnostics = %s", numRequestedContainers.get(),
+          numCompletedContainers.get(), numAllocatedContainers.get(),
+          numFailedContainers.get(), diagnostics);
       success = false;
     }
     try {
-      amRMClient.unregisterApplicationMaster(appStatus, appMessage, null);
-    } catch (YarnException ex) {
+      amRMClient.unregisterApplicationMaster(appStatus, message, null);
+    } catch (YarnException | IOException ex) {
       LOG.error("Failed to unregister application", ex);
-    } catch (IOException e) {
-      LOG.error("Failed to unregister application", e);
     }
-    
     amRMClient.stop();
 
     // Stop Timeline Client
@@ -974,11 +975,17 @@ public class ApplicationMaster {
       LOG.info("Got response from RM for container ask, completedCnt="
           + completedContainers.size());
       for (ContainerStatus containerStatus : completedContainers) {
-        LOG.info(appAttemptID + " got container status for containerID="
+        String message = appAttemptID + " got container status for containerID="
             + containerStatus.getContainerId() + ", state="
             + containerStatus.getState() + ", exitStatus="
             + containerStatus.getExitStatus() + ", diagnostics="
-            + containerStatus.getDiagnostics());
+            + containerStatus.getDiagnostics();
+        if (containerStatus.getExitStatus() != 0) {
+          LOG.error(message);
+          diagnostics.append(containerStatus.getDiagnostics());
+        } else {
+          LOG.info(message);
+        }
 
         // non complete containers should not be here
         assert (containerStatus.getState() == ContainerState.COMPLETE);
@@ -1244,10 +1251,17 @@ public class ApplicationMaster {
 
     @Override
     public void onStartContainerError(ContainerId containerId, Throwable t) {
-      LOG.error("Failed to start Container " + containerId, t);
+      LOG.error("Failed to start Container {}", containerId, t);
       containers.remove(containerId);
       applicationMaster.numCompletedContainers.incrementAndGet();
       applicationMaster.numFailedContainers.incrementAndGet();
+      if (timelineServiceV2Enabled) {
+        publishContainerStartFailedEventOnTimelineServiceV2(containerId,
+            t.getMessage());
+      }
+      if (timelineServiceV1Enabled) {
+        publishContainerStartFailedEvent(containerId, t.getMessage());
+      }
     }
 
     @Override
@@ -1525,6 +1539,7 @@ public class ApplicationMaster {
     event.setEventType(DSEvent.DS_CONTAINER_END.toString());
     event.addEventInfo("State", container.getState().name());
     event.addEventInfo("Exit Status", container.getExitStatus());
+    event.addEventInfo(DIAGNOSTICS, container.getDiagnostics());
     entity.addEvent(event);
     try {
       processTimelineResponseErrors(
@@ -1653,6 +1668,58 @@ public class ApplicationMaster {
     }
   }
 
+  private void publishContainerStartFailedEventOnTimelineServiceV2(
+      final ContainerId containerId, String diagnostics) {
+    final org.apache.hadoop.yarn.api.records.timelineservice.TimelineEntity
+        entity = new org.apache.hadoop.yarn.api.records.timelineservice.
+        TimelineEntity();
+    entity.setId(containerId.toString());
+    entity.setType(DSEntity.DS_CONTAINER.toString());
+    entity.addInfo("user", appSubmitterUgi.getShortUserName());
+    org.apache.hadoop.yarn.api.records.timelineservice.TimelineEvent event =
+        new org.apache.hadoop.yarn.api.records.timelineservice
+            .TimelineEvent();
+    event.setTimestamp(System.currentTimeMillis());
+    event.setId(DSEvent.DS_CONTAINER_END.toString());
+    event.addInfo(DIAGNOSTICS, diagnostics);
+    entity.addEvent(event);
+    try {
+      appSubmitterUgi.doAs((PrivilegedExceptionAction<Object>) () -> {
+        timelineV2Client.putEntitiesAsync(entity);
+        return null;
+      });
+    } catch (Exception e) {
+      LOG.error("Container start failed event could not be published for {}",
+          containerId,
+          e instanceof UndeclaredThrowableException ? e.getCause() : e);
+    }
+  }
+
+  private void publishContainerStartFailedEvent(final ContainerId containerId,
+      String diagnostics) {
+    final TimelineEntity entityV1 = new TimelineEntity();
+    entityV1.setEntityId(containerId.toString());
+    entityV1.setEntityType(DSEntity.DS_CONTAINER.toString());
+    entityV1.setDomainId(domainId);
+    entityV1.addPrimaryFilter(USER_TIMELINE_FILTER_NAME, appSubmitterUgi
+        .getShortUserName());
+    entityV1.addPrimaryFilter(APPID_TIMELINE_FILTER_NAME,
+        containerId.getApplicationAttemptId().getApplicationId().toString());
+
+    TimelineEvent eventV1 = new TimelineEvent();
+    eventV1.setTimestamp(System.currentTimeMillis());
+    eventV1.setEventType(DSEvent.DS_CONTAINER_END.toString());
+    eventV1.addEventInfo(DIAGNOSTICS, diagnostics);
+    entityV1.addEvent(eventV1);
+    try {
+      processTimelineResponseErrors(putContainerEntity(timelineClient,
+          containerId.getApplicationAttemptId(), entityV1));
+    } catch (YarnException | IOException | ClientHandlerException e) {
+      LOG.error("Container end event could not be published for {}",
+          containerId, e);
+    }
+  }
+
   private void publishContainerEndEventOnTimelineServiceV2(
       final ContainerStatus container, long containerStartTime) {
     final org.apache.hadoop.yarn.api.records.timelineservice.TimelineEntity
@@ -1669,6 +1736,7 @@ public class ApplicationMaster {
     event.setId(DSEvent.DS_CONTAINER_END.toString());
     event.addInfo("State", container.getState().name());
     event.addInfo("Exit Status", container.getExitStatus());
+    event.addInfo(DIAGNOSTICS, container.getDiagnostics());
     entity.addEvent(event);
     entity.setIdPrefix(TimelineServiceHelper.invertLong(containerStartTime));
 

http://git-wip-us.apache.org/repos/asf/hadoop/blob/1ef0a1db/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/test/java/org/apache/hadoop/yarn/applications/distributedshell/TestDistributedShell.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/test/java/org/apache/hadoop/yarn/applications/distributedshell/TestDistributedShell.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/test/java/org/apache/hadoop/yarn/applications/distributedshell/TestDistributedShell.java
index 0e94ad5..3a98a22 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/test/java/org/apache/hadoop/yarn/applications/distributedshell/TestDistributedShell.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/test/java/org/apache/hadoop/yarn/applications/distributedshell/TestDistributedShell.java
@@ -69,6 +69,7 @@ import org.apache.hadoop.yarn.api.records.timeline.TimelineDomain;
 import org.apache.hadoop.yarn.api.records.timeline.TimelineEntities;
 import org.apache.hadoop.yarn.api.records.timelineservice.TimelineEntity;
 import org.apache.hadoop.yarn.api.records.timelineservice.TimelineEntityType;
+import org.apache.hadoop.yarn.api.records.timelineservice.TimelineEvent;
 import org.apache.hadoop.yarn.applications.distributedshell.ApplicationMaster.DSEvent;
 import org.apache.hadoop.yarn.client.api.YarnClient;
 import org.apache.hadoop.yarn.client.api.impl.DirectTimelineWriter;
@@ -665,6 +666,15 @@ public class TestDistributedShell {
           if (entityLine.contains(expectedEvent)) {
             actualCount++;
           }
+          if (expectedEvent.equals(DSEvent.DS_CONTAINER_END.toString()) &&
+              entityLine.contains(expectedEvent)) {
+            TimelineEntity entity = FileSystemTimelineReaderImpl.
+                getTimelineRecordFromJSON(entityLine, TimelineEntity.class);
+            TimelineEvent event = entity.getEvents().pollFirst();
+            Assert.assertNotNull(event);
+            Assert.assertTrue("diagnostics",
+                event.getInfo().containsKey(ApplicationMaster.DIAGNOSTICS));
+          }
           if (checkIdPrefix) {
             TimelineEntity entity = FileSystemTimelineReaderImpl.
                 getTimelineRecordFromJSON(entityLine, TimelineEntity.class);


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org