You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@slider.apache.org by st...@apache.org on 2015/05/05 00:09:19 UTC

[2/3] incubator-slider git commit: SLIDER-856 pre-emption is not treated as failure; special handling for node failures (not treated as role failures), and container limits exceeded (not treated as node failures). Counters in RoleStatus and NodeEntry for this.

SLIDER-856 pre-emption is not treated as failure; special handling for node failures (not treated as role failures), and container limits exceeded (not treated as node failures). Counters in RoleStatus and NodeEntry for this.


Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/0a50c48e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/0a50c48e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/0a50c48e

Branch: refs/heads/develop
Commit: 0a50c48e713a2df27efd226215c9f748e4335708
Parents: 3d61bed
Author: Steve Loughran <st...@apache.org>
Authored: Mon May 4 23:08:32 2015 +0100
Committer: Steve Loughran <st...@apache.org>
Committed: Mon May 4 23:08:32 2015 +0100

----------------------------------------------------------------------
 .../apache/slider/api/ClusterDescription.java   |  28 +-
 .../java/org/apache/slider/api/RoleKeys.java    |  17 +-
 .../java/org/apache/slider/api/StatusKeys.java  |   3 +
 .../org/apache/slider/api/proto/Messages.java   | 342 +++++++++++++++++--
 .../slider/api/proto/RestTypeMarshalling.java   |   9 +-
 .../slider/api/types/ComponentInformation.java  |  14 +-
 .../server/appmaster/SliderAppMaster.java       |   2 -
 .../slider/server/appmaster/state/AppState.java |  55 ++-
 .../appmaster/state/ContainerOutcome.java       |  61 ++++
 .../server/appmaster/state/NodeEntry.java       |  36 +-
 .../server/appmaster/state/RoleHistory.java     |  19 +-
 .../server/appmaster/state/RoleStatus.java      |  74 +++-
 .../src/main/proto/SliderClusterMessages.proto  |   5 +-
 .../TestMockAppStateContainerFailure.groovy     | 132 ++++++-
 .../TestRoleHistoryContainerEvents.groovy       |  10 +-
 ...stRoleHistoryFindNodesForNewInstances.groovy |  10 +-
 .../model/mock/BaseMockAppStateTest.groovy      |   5 +-
 .../appmaster/web/view/TestIndexBlock.groovy    |   6 +-
 18 files changed, 725 insertions(+), 103 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/api/ClusterDescription.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/api/ClusterDescription.java b/slider-core/src/main/java/org/apache/slider/api/ClusterDescription.java
index 7db7e7f..025bd32 100644
--- a/slider-core/src/main/java/org/apache/slider/api/ClusterDescription.java
+++ b/slider-core/src/main/java/org/apache/slider/api/ClusterDescription.java
@@ -594,7 +594,7 @@ public class ClusterDescription implements Cloneable {
   }
 
   /**
-   * Get a role opt; use {@link Integer#decode(String)} so as to take hex
+   * Get an integer role option; use {@link Integer#decode(String)} so as to take hex
    * oct and bin values too.
    *
    * @param role role to get from
@@ -609,6 +609,21 @@ public class ClusterDescription implements Cloneable {
   }
 
   /**
+   * Get a long role option; use {@link Long#decode(String)} so as to take hex
+   * and octal values too.
+   *
+   * @param role role to get from
+   * @param option option name
+   * @param defVal default value
+   * @return parsed value
+   * @throws NumberFormatException if the option value could not be parsed.
+   */
+  public long getRoleOptLong(String role, String option, long defVal) {
+    String val = getRoleOpt(role, option, Long.toString(defVal));
+    return Long.decode(val);
+  }
+
+  /**
    * Set a role option, creating the role if necessary
    * @param role role name
    * @param option option name
@@ -630,6 +645,17 @@ public class ClusterDescription implements Cloneable {
   }
 
   /**
+   * Set a role option of any object, using its string value.
+   * This works for (Boxed) numeric values as well as other objects
+   * @param role role name
+   * @param option option name
+   * @param val non-null value
+   */
+  public void setRoleOpt(String role, String option, Object val) {
+    setRoleOpt(role, option, val.toString());
+  }
+
+  /**
    * Get the value of a role requirement (cores, RAM, etc).
    * These are returned as integers, but there is special handling of the 
    * string {@link ResourceKeys#YARN_RESOURCE_MAX}, which triggers

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/api/RoleKeys.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/api/RoleKeys.java b/slider-core/src/main/java/org/apache/slider/api/RoleKeys.java
index 0f0fb8c..4512354 100644
--- a/slider-core/src/main/java/org/apache/slider/api/RoleKeys.java
+++ b/slider-core/src/main/java/org/apache/slider/api/RoleKeys.java
@@ -45,11 +45,26 @@ public interface RoleKeys {
   String ROLE_RELEASING_INSTANCES = "role.releasing.instances";
 
   /**
-   * Status report: number currently being released: {@value} 
+   * Status report: total number that have failed: {@value}
    */
   String ROLE_FAILED_INSTANCES = "role.failed.instances";
 
   /**
+   * Status report: number that have failed recently: {@value}
+   */
+  String ROLE_FAILED_RECENTLY_INSTANCES = "role.failed.recently.instances";
+
+  /**
+   * Status report: number that have failed for node-related issues: {@value}
+   */
+  String ROLE_NODE_FAILED_INSTANCES = "role.failed.node.instances";
+
+  /**
+   * Status report: number that have been pre-empted: {@value}
+   */
+  String ROLE_PREEMPTED_INSTANCES = "role.failed.preempted.instances";
+
+  /**
    * Status report: number currently being released: {@value} 
    */
   String ROLE_FAILED_STARTING_INSTANCES = "role.failed.starting.instances";

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/api/StatusKeys.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/api/StatusKeys.java b/slider-core/src/main/java/org/apache/slider/api/StatusKeys.java
index ef68aad..6a0a2fa 100644
--- a/slider-core/src/main/java/org/apache/slider/api/StatusKeys.java
+++ b/slider-core/src/main/java/org/apache/slider/api/StatusKeys.java
@@ -27,6 +27,9 @@ public interface StatusKeys {
   String STATISTICS_CONTAINERS_COMPLETED = "containers.completed";
   String STATISTICS_CONTAINERS_DESIRED = "containers.desired";
   String STATISTICS_CONTAINERS_FAILED = "containers.failed";
+  String STATISTICS_CONTAINERS_FAILED_RECENTLY = "containers.failed.recently";
+  String STATISTICS_CONTAINERS_FAILED_NODE = "containers.failed.node";
+  String STATISTICS_CONTAINERS_PREEMPTED = "containers.failed.preempted";
   String STATISTICS_CONTAINERS_LIVE = "containers.live";
   String STATISTICS_CONTAINERS_REQUESTED = "containers.requested";
   String STATISTICS_CONTAINERS_STARTED = "containers.start.started";

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/api/proto/Messages.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/api/proto/Messages.java b/slider-core/src/main/java/org/apache/slider/api/proto/Messages.java
index a6c8da8..6c92f46 100644
--- a/slider-core/src/main/java/org/apache/slider/api/proto/Messages.java
+++ b/slider-core/src/main/java/org/apache/slider/api/proto/Messages.java
@@ -15070,6 +15070,36 @@ public final class Messages {
      */
     com.google.protobuf.ByteString
         getContainersBytes(int index);
+
+    // optional int32 failedRecently = 15;
+    /**
+     * <code>optional int32 failedRecently = 15;</code>
+     */
+    boolean hasFailedRecently();
+    /**
+     * <code>optional int32 failedRecently = 15;</code>
+     */
+    int getFailedRecently();
+
+    // optional int32 nodeFailed = 16;
+    /**
+     * <code>optional int32 nodeFailed = 16;</code>
+     */
+    boolean hasNodeFailed();
+    /**
+     * <code>optional int32 nodeFailed = 16;</code>
+     */
+    int getNodeFailed();
+
+    // optional int32 preempted = 17;
+    /**
+     * <code>optional int32 preempted = 17;</code>
+     */
+    boolean hasPreempted();
+    /**
+     * <code>optional int32 preempted = 17;</code>
+     */
+    int getPreempted();
   }
   /**
    * Protobuf type {@code org.apache.slider.api.ComponentInformationProto}
@@ -15200,6 +15230,21 @@ public final class Messages {
               containers_.add(input.readBytes());
               break;
             }
+            case 120: {
+              bitField0_ |= 0x00002000;
+              failedRecently_ = input.readInt32();
+              break;
+            }
+            case 128: {
+              bitField0_ |= 0x00004000;
+              nodeFailed_ = input.readInt32();
+              break;
+            }
+            case 136: {
+              bitField0_ |= 0x00008000;
+              preempted_ = input.readInt32();
+              break;
+            }
           }
         }
       } catch (com.google.protobuf.InvalidProtocolBufferException e) {
@@ -15535,6 +15580,54 @@ public final class Messages {
       return containers_.getByteString(index);
     }
 
+    // optional int32 failedRecently = 15;
+    public static final int FAILEDRECENTLY_FIELD_NUMBER = 15;
+    private int failedRecently_;
+    /**
+     * <code>optional int32 failedRecently = 15;</code>
+     */
+    public boolean hasFailedRecently() {
+      return ((bitField0_ & 0x00002000) == 0x00002000);
+    }
+    /**
+     * <code>optional int32 failedRecently = 15;</code>
+     */
+    public int getFailedRecently() {
+      return failedRecently_;
+    }
+
+    // optional int32 nodeFailed = 16;
+    public static final int NODEFAILED_FIELD_NUMBER = 16;
+    private int nodeFailed_;
+    /**
+     * <code>optional int32 nodeFailed = 16;</code>
+     */
+    public boolean hasNodeFailed() {
+      return ((bitField0_ & 0x00004000) == 0x00004000);
+    }
+    /**
+     * <code>optional int32 nodeFailed = 16;</code>
+     */
+    public int getNodeFailed() {
+      return nodeFailed_;
+    }
+
+    // optional int32 preempted = 17;
+    public static final int PREEMPTED_FIELD_NUMBER = 17;
+    private int preempted_;
+    /**
+     * <code>optional int32 preempted = 17;</code>
+     */
+    public boolean hasPreempted() {
+      return ((bitField0_ & 0x00008000) == 0x00008000);
+    }
+    /**
+     * <code>optional int32 preempted = 17;</code>
+     */
+    public int getPreempted() {
+      return preempted_;
+    }
+
     private void initFields() {
       name_ = "";
       priority_ = 0;
@@ -15550,6 +15643,9 @@ public final class Messages {
       failureMessage_ = "";
       placementPolicy_ = 0;
       containers_ = com.google.protobuf.LazyStringArrayList.EMPTY;
+      failedRecently_ = 0;
+      nodeFailed_ = 0;
+      preempted_ = 0;
     }
     private byte memoizedIsInitialized = -1;
     public final boolean isInitialized() {
@@ -15605,6 +15701,15 @@ public final class Messages {
       for (int i = 0; i < containers_.size(); i++) {
         output.writeBytes(14, containers_.getByteString(i));
       }
+      if (((bitField0_ & 0x00002000) == 0x00002000)) {
+        output.writeInt32(15, failedRecently_);
+      }
+      if (((bitField0_ & 0x00004000) == 0x00004000)) {
+        output.writeInt32(16, nodeFailed_);
+      }
+      if (((bitField0_ & 0x00008000) == 0x00008000)) {
+        output.writeInt32(17, preempted_);
+      }
       getUnknownFields().writeTo(output);
     }
 
@@ -15675,6 +15780,18 @@ public final class Messages {
         size += dataSize;
         size += 1 * getContainersList().size();
       }
+      if (((bitField0_ & 0x00002000) == 0x00002000)) {
+        size += com.google.protobuf.CodedOutputStream
+          .computeInt32Size(15, failedRecently_);
+      }
+      if (((bitField0_ & 0x00004000) == 0x00004000)) {
+        size += com.google.protobuf.CodedOutputStream
+          .computeInt32Size(16, nodeFailed_);
+      }
+      if (((bitField0_ & 0x00008000) == 0x00008000)) {
+        size += com.google.protobuf.CodedOutputStream
+          .computeInt32Size(17, preempted_);
+      }
       size += getUnknownFields().getSerializedSize();
       memoizedSerializedSize = size;
       return size;
@@ -15765,6 +15882,21 @@ public final class Messages {
       }
       result = result && getContainersList()
           .equals(other.getContainersList());
+      result = result && (hasFailedRecently() == other.hasFailedRecently());
+      if (hasFailedRecently()) {
+        result = result && (getFailedRecently()
+            == other.getFailedRecently());
+      }
+      result = result && (hasNodeFailed() == other.hasNodeFailed());
+      if (hasNodeFailed()) {
+        result = result && (getNodeFailed()
+            == other.getNodeFailed());
+      }
+      result = result && (hasPreempted() == other.hasPreempted());
+      if (hasPreempted()) {
+        result = result && (getPreempted()
+            == other.getPreempted());
+      }
       result = result &&
           getUnknownFields().equals(other.getUnknownFields());
       return result;
@@ -15834,6 +15966,18 @@ public final class Messages {
         hash = (37 * hash) + CONTAINERS_FIELD_NUMBER;
         hash = (53 * hash) + getContainersList().hashCode();
       }
+      if (hasFailedRecently()) {
+        hash = (37 * hash) + FAILEDRECENTLY_FIELD_NUMBER;
+        hash = (53 * hash) + getFailedRecently();
+      }
+      if (hasNodeFailed()) {
+        hash = (37 * hash) + NODEFAILED_FIELD_NUMBER;
+        hash = (53 * hash) + getNodeFailed();
+      }
+      if (hasPreempted()) {
+        hash = (37 * hash) + PREEMPTED_FIELD_NUMBER;
+        hash = (53 * hash) + getPreempted();
+      }
       hash = (29 * hash) + getUnknownFields().hashCode();
       memoizedHashCode = hash;
       return hash;
@@ -15976,6 +16120,12 @@ public final class Messages {
         bitField0_ = (bitField0_ & ~0x00001000);
         containers_ = com.google.protobuf.LazyStringArrayList.EMPTY;
         bitField0_ = (bitField0_ & ~0x00002000);
+        failedRecently_ = 0;
+        bitField0_ = (bitField0_ & ~0x00004000);
+        nodeFailed_ = 0;
+        bitField0_ = (bitField0_ & ~0x00008000);
+        preempted_ = 0;
+        bitField0_ = (bitField0_ & ~0x00010000);
         return this;
       }
 
@@ -16062,6 +16212,18 @@ public final class Messages {
           bitField0_ = (bitField0_ & ~0x00002000);
         }
         result.containers_ = containers_;
+        if (((from_bitField0_ & 0x00004000) == 0x00004000)) {
+          to_bitField0_ |= 0x00002000;
+        }
+        result.failedRecently_ = failedRecently_;
+        if (((from_bitField0_ & 0x00008000) == 0x00008000)) {
+          to_bitField0_ |= 0x00004000;
+        }
+        result.nodeFailed_ = nodeFailed_;
+        if (((from_bitField0_ & 0x00010000) == 0x00010000)) {
+          to_bitField0_ |= 0x00008000;
+        }
+        result.preempted_ = preempted_;
         result.bitField0_ = to_bitField0_;
         onBuilt();
         return result;
@@ -16131,6 +16293,15 @@ public final class Messages {
           }
           onChanged();
         }
+        if (other.hasFailedRecently()) {
+          setFailedRecently(other.getFailedRecently());
+        }
+        if (other.hasNodeFailed()) {
+          setNodeFailed(other.getNodeFailed());
+        }
+        if (other.hasPreempted()) {
+          setPreempted(other.getPreempted());
+        }
         this.mergeUnknownFields(other.getUnknownFields());
         return this;
       }
@@ -16762,6 +16933,105 @@ public final class Messages {
         return this;
       }
 
+      // optional int32 failedRecently = 15;
+      private int failedRecently_ ;
+      /**
+       * <code>optional int32 failedRecently = 15;</code>
+       */
+      public boolean hasFailedRecently() {
+        return ((bitField0_ & 0x00004000) == 0x00004000);
+      }
+      /**
+       * <code>optional int32 failedRecently = 15;</code>
+       */
+      public int getFailedRecently() {
+        return failedRecently_;
+      }
+      /**
+       * <code>optional int32 failedRecently = 15;</code>
+       */
+      public Builder setFailedRecently(int value) {
+        bitField0_ |= 0x00004000;
+        failedRecently_ = value;
+        onChanged();
+        return this;
+      }
+      /**
+       * <code>optional int32 failedRecently = 15;</code>
+       */
+      public Builder clearFailedRecently() {
+        bitField0_ = (bitField0_ & ~0x00004000);
+        failedRecently_ = 0;
+        onChanged();
+        return this;
+      }
+
+      // optional int32 nodeFailed = 16;
+      private int nodeFailed_ ;
+      /**
+       * <code>optional int32 nodeFailed = 16;</code>
+       */
+      public boolean hasNodeFailed() {
+        return ((bitField0_ & 0x00008000) == 0x00008000);
+      }
+      /**
+       * <code>optional int32 nodeFailed = 16;</code>
+       */
+      public int getNodeFailed() {
+        return nodeFailed_;
+      }
+      /**
+       * <code>optional int32 nodeFailed = 16;</code>
+       */
+      public Builder setNodeFailed(int value) {
+        bitField0_ |= 0x00008000;
+        nodeFailed_ = value;
+        onChanged();
+        return this;
+      }
+      /**
+       * <code>optional int32 nodeFailed = 16;</code>
+       */
+      public Builder clearNodeFailed() {
+        bitField0_ = (bitField0_ & ~0x00008000);
+        nodeFailed_ = 0;
+        onChanged();
+        return this;
+      }
+
+      // optional int32 preempted = 17;
+      private int preempted_ ;
+      /**
+       * <code>optional int32 preempted = 17;</code>
+       */
+      public boolean hasPreempted() {
+        return ((bitField0_ & 0x00010000) == 0x00010000);
+      }
+      /**
+       * <code>optional int32 preempted = 17;</code>
+       */
+      public int getPreempted() {
+        return preempted_;
+      }
+      /**
+       * <code>optional int32 preempted = 17;</code>
+       */
+      public Builder setPreempted(int value) {
+        bitField0_ |= 0x00010000;
+        preempted_ = value;
+        onChanged();
+        return this;
+      }
+      /**
+       * <code>optional int32 preempted = 17;</code>
+       */
+      public Builder clearPreempted() {
+        bitField0_ = (bitField0_ & ~0x00010000);
+        preempted_ = 0;
+        onChanged();
+        return this;
+      }
+
       // @@protoc_insertion_point(builder_scope:org.apache.slider.api.ComponentInformationProto)
     }
 
@@ -28687,46 +28957,48 @@ public final class Messages {
       " \002(\t\022\023\n\013application\030\003 \002(\t\"`\n#Application" +
       "LivenessInformationProto\022\034\n\024allRequestsS" +
       "atisfied\030\001 \001(\010\022\033\n\023requestsOutstanding\030\002 " +
-      "\001(\005\"\250\002\n\031ComponentInformationProto\022\014\n\004nam",
+      "\001(\005\"\347\002\n\031ComponentInformationProto\022\014\n\004nam",
       "e\030\001 \001(\t\022\020\n\010priority\030\002 \001(\005\022\017\n\007desired\030\003 \001" +
       "(\005\022\016\n\006actual\030\004 \001(\005\022\021\n\treleasing\030\005 \001(\005\022\021\n" +
       "\trequested\030\006 \001(\005\022\016\n\006failed\030\007 \001(\005\022\017\n\007star" +
       "ted\030\010 \001(\005\022\023\n\013startFailed\030\t \001(\005\022\021\n\tcomple" +
       "ted\030\n \001(\005\022\026\n\016totalRequested\030\013 \001(\005\022\026\n\016fai" +
       "lureMessage\030\014 \001(\t\022\027\n\017placementPolicy\030\r \001" +
-      "(\005\022\022\n\ncontainers\030\016 \003(\t\"\210\002\n\031ContainerInfo" +
-      "rmationProto\022\023\n\013containerId\030\001 \001(\t\022\021\n\tcom" +
-      "ponent\030\002 \001(\t\022\020\n\010released\030\003 \001(\010\022\r\n\005state\030" +
-      "\004 \001(\005\022\020\n\010exitCode\030\005 \001(\005\022\023\n\013diagnostics\030\006",
-      " \001(\t\022\022\n\ncreateTime\030\007 \001(\003\022\021\n\tstartTime\030\010 " +
-      "\001(\003\022\016\n\006output\030\t \003(\t\022\014\n\004host\030\n \001(\t\022\017\n\007hos" +
-      "tURL\030\013 \001(\t\022\021\n\tplacement\030\014 \001(\t\022\022\n\nappVers" +
-      "ion\030\r \001(\t\"N\n\024PingInformationProto\022\014\n\004tex" +
-      "t\030\001 \001(\t\022\014\n\004verb\030\002 \001(\t\022\014\n\004body\030\003 \001(\t\022\014\n\004t" +
-      "ime\030\004 \001(\003\"\026\n\024GetModelRequestProto\"\035\n\033Get" +
-      "ModelDesiredRequestProto\"$\n\"GetModelDesi" +
-      "redAppconfRequestProto\"&\n$GetModelDesire" +
-      "dResourcesRequestProto\"%\n#GetModelResolv" +
-      "edAppconfRequestProto\"\'\n%GetModelResolve",
-      "dResourcesRequestProto\"#\n!GetModelLiveRe" +
-      "sourcesRequestProto\"\037\n\035GetLiveContainers" +
-      "RequestProto\"u\n\036GetLiveContainersRespons" +
-      "eProto\022\r\n\005names\030\001 \003(\t\022D\n\ncontainers\030\002 \003(" +
-      "\01320.org.apache.slider.api.ContainerInfor" +
-      "mationProto\"3\n\034GetLiveContainerRequestPr" +
-      "oto\022\023\n\013containerId\030\001 \002(\t\"\037\n\035GetLiveCompo" +
-      "nentsRequestProto\"u\n\036GetLiveComponentsRe" +
-      "sponseProto\022\r\n\005names\030\001 \003(\t\022D\n\ncomponents" +
-      "\030\002 \003(\01320.org.apache.slider.api.Component",
-      "InformationProto\",\n\034GetLiveComponentRequ" +
-      "estProto\022\014\n\004name\030\001 \002(\t\"$\n\"GetApplication" +
-      "LivenessRequestProto\"\023\n\021EmptyPayloadProt" +
-      "o\" \n\020WrappedJsonProto\022\014\n\004json\030\001 \002(\t\"h\n\037G" +
-      "etCertificateStoreRequestProto\022\020\n\010hostna" +
-      "me\030\001 \001(\t\022\023\n\013requesterId\030\002 \002(\t\022\020\n\010passwor" +
-      "d\030\003 \002(\t\022\014\n\004type\030\004 \002(\t\"1\n GetCertificateS" +
-      "toreResponseProto\022\r\n\005store\030\001 \002(\014B-\n\033org." +
-      "apache.slider.api.protoB\010Messages\210\001\001\240\001\001"
+      "(\005\022\022\n\ncontainers\030\016 \003(\t\022\026\n\016failedRecently" +
+      "\030\017 \001(\005\022\022\n\nnodeFailed\030\020 \001(\005\022\021\n\tpreempted\030" +
+      "\021 \001(\005\"\210\002\n\031ContainerInformationProto\022\023\n\013c" +
+      "ontainerId\030\001 \001(\t\022\021\n\tcomponent\030\002 \001(\t\022\020\n\010r",
+      "eleased\030\003 \001(\010\022\r\n\005state\030\004 \001(\005\022\020\n\010exitCode" +
+      "\030\005 \001(\005\022\023\n\013diagnostics\030\006 \001(\t\022\022\n\ncreateTim" +
+      "e\030\007 \001(\003\022\021\n\tstartTime\030\010 \001(\003\022\016\n\006output\030\t \003" +
+      "(\t\022\014\n\004host\030\n \001(\t\022\017\n\007hostURL\030\013 \001(\t\022\021\n\tpla" +
+      "cement\030\014 \001(\t\022\022\n\nappVersion\030\r \001(\t\"N\n\024Ping" +
+      "InformationProto\022\014\n\004text\030\001 \001(\t\022\014\n\004verb\030\002" +
+      " \001(\t\022\014\n\004body\030\003 \001(\t\022\014\n\004time\030\004 \001(\003\"\026\n\024GetM" +
+      "odelRequestProto\"\035\n\033GetModelDesiredReque" +
+      "stProto\"$\n\"GetModelDesiredAppconfRequest" +
+      "Proto\"&\n$GetModelDesiredResourcesRequest",
+      "Proto\"%\n#GetModelResolvedAppconfRequestP" +
+      "roto\"\'\n%GetModelResolvedResourcesRequest" +
+      "Proto\"#\n!GetModelLiveResourcesRequestPro" +
+      "to\"\037\n\035GetLiveContainersRequestProto\"u\n\036G" +
+      "etLiveContainersResponseProto\022\r\n\005names\030\001" +
+      " \003(\t\022D\n\ncontainers\030\002 \003(\01320.org.apache.sl" +
+      "ider.api.ContainerInformationProto\"3\n\034Ge" +
+      "tLiveContainerRequestProto\022\023\n\013containerI" +
+      "d\030\001 \002(\t\"\037\n\035GetLiveComponentsRequestProto" +
+      "\"u\n\036GetLiveComponentsResponseProto\022\r\n\005na",
+      "mes\030\001 \003(\t\022D\n\ncomponents\030\002 \003(\01320.org.apac" +
+      "he.slider.api.ComponentInformationProto\"" +
+      ",\n\034GetLiveComponentRequestProto\022\014\n\004name\030" +
+      "\001 \002(\t\"$\n\"GetApplicationLivenessRequestPr" +
+      "oto\"\023\n\021EmptyPayloadProto\" \n\020WrappedJsonP" +
+      "roto\022\014\n\004json\030\001 \002(\t\"h\n\037GetCertificateStor" +
+      "eRequestProto\022\020\n\010hostname\030\001 \001(\t\022\023\n\013reque" +
+      "sterId\030\002 \002(\t\022\020\n\010password\030\003 \002(\t\022\014\n\004type\030\004" +
+      " \002(\t\"1\n GetCertificateStoreResponseProto" +
+      "\022\r\n\005store\030\001 \002(\014B-\n\033org.apache.slider.api",
+      ".protoB\010Messages\210\001\001\240\001\001"
     };
     com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner =
       new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() {
@@ -28882,7 +29154,7 @@ public final class Messages {
           internal_static_org_apache_slider_api_ComponentInformationProto_fieldAccessorTable = new
             com.google.protobuf.GeneratedMessage.FieldAccessorTable(
               internal_static_org_apache_slider_api_ComponentInformationProto_descriptor,
-              new java.lang.String[] { "Name", "Priority", "Desired", "Actual", "Releasing", "Requested", "Failed", "Started", "StartFailed", "Completed", "TotalRequested", "FailureMessage", "PlacementPolicy", "Containers", });
+              new java.lang.String[] { "Name", "Priority", "Desired", "Actual", "Releasing", "Requested", "Failed", "Started", "StartFailed", "Completed", "TotalRequested", "FailureMessage", "PlacementPolicy", "Containers", "FailedRecently", "NodeFailed", "Preempted", });
           internal_static_org_apache_slider_api_ContainerInformationProto_descriptor =
             getDescriptor().getMessageTypes().get(25);
           internal_static_org_apache_slider_api_ContainerInformationProto_fieldAccessorTable = new

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/api/proto/RestTypeMarshalling.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/api/proto/RestTypeMarshalling.java b/slider-core/src/main/java/org/apache/slider/api/proto/RestTypeMarshalling.java
index f005bc3..c408ed2 100644
--- a/slider-core/src/main/java/org/apache/slider/api/proto/RestTypeMarshalling.java
+++ b/slider-core/src/main/java/org/apache/slider/api/proto/RestTypeMarshalling.java
@@ -78,7 +78,7 @@ public class RestTypeMarshalling {
     info.started = wire.getStarted();
     info.startFailed = wire.getStartFailed();
     info.totalRequested = wire.getTotalRequested();
-    info.containers = new ArrayList<String>(wire.getContainersList());
+    info.containers = new ArrayList<>(wire.getContainersList());
     if (wire.hasFailureMessage()) {
       info.failureMessage = wire.getFailureMessage();
     }
@@ -95,6 +95,7 @@ public class RestTypeMarshalling {
     return builder.build();
   }
 
+  @SuppressWarnings("IOResourceOpenedButNotSafelyClosed")
   private static byte[] getStoreBytes(SecurityStore securityStore)
       throws IOException {
     InputStream is = new FileInputStream(securityStore.getFile());
@@ -105,8 +106,7 @@ public class RestTypeMarshalling {
     return response.getStore().toByteArray();
   }
 
-  public static Messages.ComponentInformationProto
-  marshall(ComponentInformation info) {
+  public static Messages.ComponentInformationProto marshall(ComponentInformation info) {
 
     Messages.ComponentInformationProto.Builder builder =
         Messages.ComponentInformationProto.newBuilder();
@@ -123,6 +123,9 @@ public class RestTypeMarshalling {
     builder.setStarted(info.started);
     builder.setStartFailed(info.startFailed);
     builder.setTotalRequested(info.totalRequested);
+    builder.setNodeFailed(info.nodeFailed);
+    builder.setPreempted(info.preempted);
+    builder.setFailedRecently(info.failedRecently);
     if (info.failureMessage != null) {
       builder.setFailureMessage(info.failureMessage);
     }

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/api/types/ComponentInformation.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/api/types/ComponentInformation.java b/slider-core/src/main/java/org/apache/slider/api/types/ComponentInformation.java
index 286c3a1..52f6c08 100644
--- a/slider-core/src/main/java/org/apache/slider/api/types/ComponentInformation.java
+++ b/slider-core/src/main/java/org/apache/slider/api/types/ComponentInformation.java
@@ -28,6 +28,14 @@ import java.util.Map;
 
 /**
  * Serializable version of component data.
+ * <p>
+ * This is sent in REST calls as a JSON object, but is also marshalled into
+ * a protobuf structure. Look at {@link org.apache.slider.api.proto.RestTypeMarshalling}
+ * for the specifics there.
+ * <p>
+ * This means that if any fields are added here, they must also be added to
+ * <code>src/main/proto/SliderClusterMessages.proto</code> and
+ * the protobuf structures rebuilt.
  */
 @JsonIgnoreProperties(ignoreUnknown = true)
 @JsonSerialize(include = JsonSerialize.Inclusion.NON_NULL)
@@ -40,6 +48,7 @@ public class ComponentInformation {
   public int placementPolicy;
   public int requested;
   public int failed, started, startFailed, completed, totalRequested;
+  public int nodeFailed, failedRecently, preempted;
   public String failureMessage;
   public List<String> containers;
 
@@ -48,7 +57,7 @@ public class ComponentInformation {
    * @return a map for use in statistics reports
    */
   public Map<String, Integer> buildStatistics() {
-    Map<String, Integer> stats = new HashMap<String, Integer>();
+    Map<String, Integer> stats = new HashMap<>();
     stats.put(StatusKeys.STATISTICS_CONTAINERS_ACTIVE_REQUESTS, requested);
     stats.put(StatusKeys.STATISTICS_CONTAINERS_COMPLETED, completed);
     stats.put(StatusKeys.STATISTICS_CONTAINERS_DESIRED, desired);
@@ -57,6 +66,9 @@ public class ComponentInformation {
     stats.put(StatusKeys.STATISTICS_CONTAINERS_REQUESTED, totalRequested);
     stats.put(StatusKeys.STATISTICS_CONTAINERS_STARTED, started);
     stats.put(StatusKeys.STATISTICS_CONTAINERS_START_FAILED, startFailed);
+    stats.put(StatusKeys.STATISTICS_CONTAINERS_FAILED_RECENTLY, failedRecently);
+    stats.put(StatusKeys.STATISTICS_CONTAINERS_FAILED_NODE, nodeFailed);
+    stats.put(StatusKeys.STATISTICS_CONTAINERS_PREEMPTED, preempted);
     return stats;
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
index 3ab4501..7da627e 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
@@ -164,9 +164,7 @@ import org.apache.slider.server.services.yarnregistry.YarnRegistryViewForProvide
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.BufferedReader;
 import java.io.File;
-import java.io.FileReader;
 import java.io.IOException;
 import java.net.InetSocketAddress;
 import java.net.URI;

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
index 5428a34..18eb578 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
@@ -93,6 +93,9 @@ import static org.apache.slider.api.ResourceKeys.YARN_LABEL_EXPRESSION;
 import static org.apache.slider.api.ResourceKeys.YARN_MEMORY;
 import static org.apache.slider.api.RoleKeys.ROLE_FAILED_INSTANCES;
 import static org.apache.slider.api.RoleKeys.ROLE_FAILED_STARTING_INSTANCES;
+import static org.apache.slider.api.RoleKeys.ROLE_FAILED_RECENTLY_INSTANCES;
+import static org.apache.slider.api.RoleKeys.ROLE_NODE_FAILED_INSTANCES;
+import static org.apache.slider.api.RoleKeys.ROLE_PREEMPTED_INSTANCES;
 import static org.apache.slider.api.RoleKeys.ROLE_RELEASING_INSTANCES;
 import static org.apache.slider.api.RoleKeys.ROLE_REQUESTED_INSTANCES;
 import static org.apache.slider.api.StateValues.STATE_CREATED;
@@ -1430,7 +1433,7 @@ public class AppState {
         text = "container start failure";
       }
       instance.diagnostics = text;
-      roleStatus.noteFailed(true, null);
+      roleStatus.noteFailed(true, text, ContainerOutcome.Failed);
       getFailedNodes().put(containerId, instance);
       roleHistory.onNodeManagerContainerStartFailed(instance.container);
     }
@@ -1477,7 +1480,11 @@ public class AppState {
   public static class NodeCompletionResult {
     public boolean surplusNode = false;
     public RoleInstance roleInstance;
-    public boolean containerFailed;
+    // did the container fail for *any* reason?
+    public boolean containerFailed = false;
+    // detailed outcome on the container failure
+    public ContainerOutcome outcome = ContainerOutcome.Completed;
+    public int exitStatus = 0;
     public boolean unknownNode = false;
 
   
@@ -1486,7 +1493,9 @@ public class AppState {
         new StringBuilder("NodeCompletionResult{");
       sb.append("surplusNode=").append(surplusNode);
       sb.append(", roleInstance=").append(roleInstance);
+      sb.append(", exitStatus=").append(exitStatus);
       sb.append(", containerFailed=").append(containerFailed);
+      sb.append(", outcome=").append(outcome);
       sb.append(", unknownNode=").append(unknownNode);
       sb.append('}');
       return sb.toString();
@@ -1504,6 +1513,8 @@ public class AppState {
     NodeCompletionResult result = new NodeCompletionResult();
     RoleInstance roleInstance;
 
+    int exitStatus = status.getExitStatus();
+    result.exitStatus = exitStatus;
     if (containersBeingReleased.containsKey(containerId)) {
       log.info("Container was queued for release : {}", containerId);
       Container container = containersBeingReleased.remove(containerId);
@@ -1516,14 +1527,18 @@ public class AppState {
           actual,
           releasing,
           completedCount);
+      result.outcome = ContainerOutcome.Completed;
       roleHistory.onReleaseCompleted(container);
 
     } else if (surplusNodes.remove(containerId)) {
       //its a surplus one being purged
       result.surplusNode = true;
     } else {
-      //a container has failed 
+      // a container has failed or been killed
+      // use the exit code to determine the outcome
       result.containerFailed = true;
+      result.outcome = ContainerOutcome.fromExitStatus(exitStatus);
+
       roleInstance = removeOwnedContainer(containerId);
       if (roleInstance != null) {
         //it was active, move it to failed 
@@ -1544,32 +1559,34 @@ public class AppState {
           boolean shortLived = isShortLived(roleInstance);
           String message;
           Container failedContainer = roleInstance.container;
-          
+
           //build the failure message
           if (failedContainer != null) {
             String completedLogsUrl = getLogsURLForContainer(failedContainer);
-            message = String.format("Failure %s on host %s: %s",
+            message = String.format("Failure %s on host %s (%d): %s",
                 roleInstance.getContainerId().toString(),
                 failedContainer.getNodeId().getHost(),
+                exitStatus,
                 completedLogsUrl);
           } else {
-            message = String.format("Failure %s", containerId);
+            message = String.format("Failure %s (%d)", containerId, exitStatus);
           }
-          int failed = roleStatus.noteFailed(shortLived, message);
+          roleStatus.noteFailed(shortLived, message, result.outcome);
+          long failed = roleStatus.getFailed();
           log.info("Current count of failed role[{}] {} =  {}",
               roleId, rolename, failed);
           if (failedContainer != null) {
-            roleHistory.onFailedContainer(failedContainer, shortLived);
+            roleHistory.onFailedContainer(failedContainer, shortLived, result.outcome);
           }
-          
+
         } catch (YarnRuntimeException e1) {
           log.error("Failed container of unknown role {}", roleId);
         }
       } else {
         //this isn't a known container.
-        
+
         log.error("Notified of completed container {} that is not in the list" +
-                  " of active or failed containers", containerId);
+            " of active or failed containers", containerId);
         completionOfUnknownContainerEvent.incrementAndGet();
         result.unknownNode = true;
       }
@@ -1587,7 +1604,7 @@ public class AppState {
     RoleInstance node = getLiveNodes().remove(id);
     if (node != null) {
       node.state = STATE_DESTROYED;
-      node.exitCode = status.getExitStatus();
+      node.exitCode = exitStatus;
       node.diagnostics = status.getDiagnostics();
       getCompletedNodes().put(id, node);
       result.roleInstance = node;
@@ -1595,7 +1612,6 @@ public class AppState {
       // not in the list
       log.warn("Received notification of completion of unknown node {}", id);
       completionOfNodeNotInLiveListEvent.incrementAndGet();
-
     }
     
     // and the active node list if present
@@ -1709,6 +1725,9 @@ public class AppState {
       cd.setRoleOpt(rolename, ROLE_RELEASING_INSTANCES, role.getReleasing());
       cd.setRoleOpt(rolename, ROLE_FAILED_INSTANCES, role.getFailed());
       cd.setRoleOpt(rolename, ROLE_FAILED_STARTING_INSTANCES, role.getStartFailed());
+      cd.setRoleOpt(rolename, ROLE_FAILED_RECENTLY_INSTANCES, role.getFailedRecently());
+      cd.setRoleOpt(rolename, ROLE_NODE_FAILED_INSTANCES, role.getNodeFailed());
+      cd.setRoleOpt(rolename, ROLE_PREEMPTED_INSTANCES, role.getPreempted());
       Map<String, Integer> stats = role.buildStatistics();
       cd.statistics.put(rolename, stats);
     }
@@ -1798,14 +1817,14 @@ public class AppState {
   }
 
   /**
-   * Check the failure threshold for a role
+   * Check the "recent" failure threshold for a role
    * @param role role to examine
    * @throws TriggerClusterTeardownException if the role
    * has failed too many times
    */
   private void checkFailureThreshold(RoleStatus role)
       throws TriggerClusterTeardownException {
-    int failures = role.getFailed();
+    long failures = role.getFailedRecently();
     int threshold = getFailureThresholdForRole(role);
     log.debug("Failure count of component: {}: {}, threshold={}",
         role.getName(), failures, threshold);
@@ -1814,7 +1833,7 @@ public class AppState {
       throw new TriggerClusterTeardownException(
         SliderExitCodes.EXIT_DEPLOYMENT_FAILED,
           FinalApplicationStatus.FAILED, ErrorStrings.E_UNSTABLE_CLUSTER +
-        " - failed with component %s failing %d times (%d in startup);" +
+        " - failed with component %s failed 'recently' %d times (%d in startup);" +
         " threshold is %d - last failure: %s",
           role.getName(),
         role.getFailed(),
@@ -1853,11 +1872,11 @@ public class AppState {
   }
 
   /**
-   * Reset the failure counts of all roles
+   * Reset the "recent" failure counts of all roles
    */
   public void resetFailureCounts() {
     for (RoleStatus roleStatus : getRoleStatusMap().values()) {
-      int failed = roleStatus.resetFailed();
+      long failed = roleStatus.resetFailedRecently();
       log.info("Resetting failure count of {}; was {}",
                roleStatus.getName(),
           failed);

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/server/appmaster/state/ContainerOutcome.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/ContainerOutcome.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/ContainerOutcome.java
new file mode 100644
index 0000000..59ab30b
--- /dev/null
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/ContainerOutcome.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.slider.server.appmaster.state;
+
+import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
+
+/**
+ * Container outcomes we care about; slightly simplified from
+ * {@link ContainerExitStatus} -and hopefully able to handle
+ * any new exit codes.
+ */
+public enum ContainerOutcome {
+  Completed,
+  Failed,
+  Failed_limits_exceeded,
+  Node_failure,
+  Preempted;
+
+  /**
+   * Build a container outcome from an exit status.
+   * The values in {@link ContainerExitStatus} are used
+   * here.
+   * @param exitStatus exit status
+   * @return an enumeration of the outcome.
+   */
+  public static ContainerOutcome fromExitStatus(int exitStatus) {
+    switch (exitStatus) {
+      case ContainerExitStatus.ABORTED:
+      case ContainerExitStatus.KILLED_BY_APPMASTER:
+      case ContainerExitStatus.KILLED_BY_RESOURCEMANAGER:
+      case ContainerExitStatus.KILLED_AFTER_APP_COMPLETION:
+        // could either be a release or node failure. Treat as completion
+        return Completed;
+      case ContainerExitStatus.DISKS_FAILED:
+        return Node_failure;
+      case ContainerExitStatus.PREEMPTED:
+        return Preempted;
+      case ContainerExitStatus.KILLED_EXCEEDED_PMEM:
+      case ContainerExitStatus.KILLED_EXCEEDED_VMEM:
+        return Failed_limits_exceeded;
+      default:
+        return exitStatus == 0 ? Completed : Failed;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeEntry.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeEntry.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeEntry.java
index 0aa2d42..0f46054 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeEntry.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeEntry.java
@@ -53,6 +53,7 @@ public class NodeEntry {
   private int starting;
   private int startFailed;
   private int failed;
+  private int preempted;
   /**
    * Counter of "failed recently" events. These are all failures
    * which have happened since it was last reset.
@@ -138,8 +139,7 @@ public class NodeEntry {
   public synchronized boolean onStartFailed() {
     decStarting();
     ++startFailed;
-    ++failedRecently;
-    return containerCompleted(false);
+    return containerCompleted(false, ContainerOutcome.Failed);
   }
   
   /**
@@ -183,14 +183,35 @@ public class NodeEntry {
    * planned: dec our release count
    * unplanned: dec our live count
    * @param wasReleased true if this was planned
+   * @param outcome outcome of the container; only consulted when it was not released
    * @return true if this node is now available
    */
-  public synchronized boolean containerCompleted(boolean wasReleased) {
+  public synchronized boolean containerCompleted(boolean wasReleased, ContainerOutcome outcome) {
     if (wasReleased) {
       releasing = RoleHistoryUtils.decToFloor(releasing);
     } else {
-      ++failed;
-      ++failedRecently;
+      // for the node, we use the outcome of the failure to decide
+      // whether this is potentially "node-related"
+      switch(outcome) {
+        // general "any reason" app failure
+        case Failed:
+        // specific node failure
+        case Node_failure:
+
+          ++failed;
+          ++failedRecently;
+          break;
+
+        case Preempted:
+          preempted++;
+          break;
+
+          // failures which are node-independent
+        case Failed_limits_exceeded:
+        case Completed:
+        default:
+          break;
+      }
     }
     decLive();
     return isAvailable();
@@ -219,6 +240,10 @@ public class NodeEntry {
     return failedRecently;
   }
 
+  public synchronized int getPreempted() {
+    return preempted;
+  }
+
   /**
    * Reset the failed recently count.
    */
@@ -236,6 +261,7 @@ public class NodeEntry {
     sb.append(", releasing=").append(releasing);
     sb.append(", lastUsed=").append(lastUsed);
     sb.append(", failedRecently=").append(failedRecently);
+    sb.append(", preempted=").append(preempted);
     sb.append(", startFailed=").append(startFailed);
     sb.append('}');
     return sb.toString();

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
index bb482af..926d440 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
@@ -767,7 +767,7 @@ public class RoleHistory {
    * @return true if the node was queued
    */
   public boolean onNodeManagerContainerStartFailed(Container container) {
-    return markContainerFinished(container, false, true);
+    return markContainerFinished(container, false, true, ContainerOutcome.Failed);
   }
 
   /**
@@ -808,7 +808,7 @@ public class RoleHistory {
    * @return true if the node was queued
    */
   public boolean onReleaseCompleted(Container container) {
-    return markContainerFinished(container, true, false);
+    return markContainerFinished(container, true, false, ContainerOutcome.Completed);
   }
 
   /**
@@ -817,10 +817,13 @@ public class RoleHistory {
    *
    * @param container completed container
    * @param shortLived was the container short lived?
+   * @param outcome outcome of the failed container
    * @return true if the node is considered available for work
    */
-  public boolean onFailedContainer(Container container, boolean shortLived) {
-    return markContainerFinished(container, false, shortLived);
+  public boolean onFailedContainer(Container container,
+      boolean shortLived,
+      ContainerOutcome outcome) {
+    return markContainerFinished(container, false, shortLived, outcome);
   }
 
   /**
@@ -831,11 +834,13 @@ public class RoleHistory {
    * @param container completed container
    * @param wasReleased was the container released?
    * @param shortLived was the container short lived?
+   * @param outcome outcome of the container
    * @return true if the node was queued
    */
   protected synchronized boolean markContainerFinished(Container container,
-                                                       boolean wasReleased,
-                                                       boolean shortLived) {
+      boolean wasReleased,
+      boolean shortLived,
+      ContainerOutcome outcome) {
     NodeEntry nodeEntry = getOrCreateNodeEntry(container);
     log.info("Finished container for node {}, released={}, shortlived={}",
         nodeEntry.rolePriority, wasReleased, shortLived);
@@ -844,7 +849,7 @@ public class RoleHistory {
       nodeEntry.onStartFailed();
       available = false;
     } else {
-      available = nodeEntry.containerCompleted(wasReleased);
+      available = nodeEntry.containerCompleted(wasReleased, outcome);
       maybeQueueNodeForWork(container, nodeEntry, available);
     }
     touch();

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
index 899948f..e2974fc 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
@@ -25,6 +25,7 @@ import org.apache.slider.providers.ProviderRole;
 import java.io.Serializable;
 import java.util.Comparator;
 import java.util.Map;
+import java.util.concurrent.atomic.AtomicLong;
 
 
 /**
@@ -43,7 +44,12 @@ public final class RoleStatus implements Cloneable {
   private final ProviderRole providerRole;
 
   private int desired, actual, requested, releasing;
-  private int failed, started, startFailed, completed, totalRequested;
+  private int failed, startFailed;
+  private int started,  completed, totalRequested;
+  private final AtomicLong preempted = new AtomicLong(0);
+  private final AtomicLong nodeFailed = new AtomicLong(0);
+  private final AtomicLong failedRecently = new AtomicLong(0);
+  private final AtomicLong limitsExceeded = new AtomicLong(0);
 
   private String failureMessage = "";
 
@@ -163,15 +169,20 @@ public final class RoleStatus implements Cloneable {
     return failed;
   }
 
+  public synchronized long getFailedRecently() {
+    return failedRecently.get();
+  }
+
   /**
-   * Reset the failure counts
-   * @return the total number of failures up to this point
+   * Reset the recent failure count
+   * @return the number of failures in the "recent" window
    */
-  public synchronized int resetFailed() {
-    int total = failed + startFailed;
-    failed = 0;
-    startFailed = 0;
-    return total;
+  public long resetFailedRecently() {
+    return failedRecently.getAndSet(0);
+  }
+
+  public long getLimitsExceeded() {
+    return limitsExceeded.get();
   }
 
   /**
@@ -179,19 +190,37 @@ public final class RoleStatus implements Cloneable {
    * be used in any diagnostics if an exception
    * is later raised.
    * @param startupFailure flag to indicate this was a startup event
-   * @return the number of failures
    * @param text text about the failure
+   * @param outcome outcome of the container
    */
-  public synchronized int noteFailed(boolean startupFailure, String text) {
-    int current = ++failed;
+  public synchronized void noteFailed(boolean startupFailure, String text,
+      ContainerOutcome outcome) {
     if (text != null) {
       failureMessage = text;
     }
-    //have a look to see if it short lived
-    if (startupFailure) {
-      incStartFailed();
+    switch (outcome) {
+      case Preempted:
+        preempted.incrementAndGet();
+        break;
+
+      case Node_failure:
+        nodeFailed.incrementAndGet();
+        failed++;
+        break;
+
+      case Failed_limits_exceeded: // exceeded memory or CPU; app/configuration related
+        limitsExceeded.incrementAndGet();
+        // fall through
+      case Failed: // application failure, possibly node related, possibly not
+      default: // anything else (future-proofing)
+        failed++;
+        failedRecently.incrementAndGet();
+        //have a look to see if it short lived
+        if (startupFailure) {
+          incStartFailed();
+        }
+        break;
     }
-    return current;
   }
 
   public synchronized int getStartFailed() {
@@ -229,7 +258,14 @@ public final class RoleStatus implements Cloneable {
     return totalRequested;
   }
 
-  
+  public long getPreempted() {
+    return preempted.get();
+  }
+
+  public long getNodeFailed() {
+    return nodeFailed.get();
+  }
+
   /**
    * Get the number of roles we are short of.
    * nodes released are ignored.
@@ -267,6 +303,9 @@ public final class RoleStatus implements Cloneable {
            ", requested=" + requested +
            ", releasing=" + releasing +
            ", failed=" + failed +
+           ", failed recently=" + failedRecently.get() +
+           ", node failed=" + nodeFailed.get() +
+           ", pre-empted=" + preempted.get() +
            ", started=" + started +
            ", startFailed=" + startFailed +
            ", completed=" + completed +
@@ -313,6 +352,9 @@ public final class RoleStatus implements Cloneable {
     info.placementPolicy = getPlacementPolicy();
     info.failureMessage = failureMessage;
     info.totalRequested = totalRequested;
+    info.failedRecently = failedRecently.intValue();
+    info.nodeFailed = nodeFailed.intValue();
+    info.preempted = preempted.intValue();
     return info;
   }
   

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/proto/SliderClusterMessages.proto
----------------------------------------------------------------------
diff --git a/slider-core/src/main/proto/SliderClusterMessages.proto b/slider-core/src/main/proto/SliderClusterMessages.proto
index 5e770d5..caca87b 100644
--- a/slider-core/src/main/proto/SliderClusterMessages.proto
+++ b/slider-core/src/main/proto/SliderClusterMessages.proto
@@ -246,7 +246,10 @@ message ComponentInformationProto {
   optional int32 totalRequested =  11;
   optional string failureMessage = 12  ;
   optional int32 placementPolicy =  13;
-  repeated string containers =  14;   
+  repeated string containers =  14;
+  optional int32 failedRecently = 15;
+  optional int32 nodeFailed = 16;
+  optional int32 preempted = 17;
 }
 
 /*

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy
index 6368a3d..1b79115 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy
@@ -30,6 +30,7 @@ import org.apache.slider.server.appmaster.model.mock.BaseMockAppStateTest
 import org.apache.slider.server.appmaster.model.mock.MockRoles
 import org.apache.slider.server.appmaster.model.mock.MockYarnEngine
 import org.apache.slider.server.appmaster.state.AppState
+import org.apache.slider.server.appmaster.state.ContainerOutcome
 import org.apache.slider.server.appmaster.state.NodeEntry
 import org.apache.slider.server.appmaster.state.NodeInstance
 import org.apache.slider.server.appmaster.state.RoleHistory
@@ -87,7 +88,7 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest
     assert appState.isShortLived(instance)
     AppState.NodeCompletionResult result = appState.onCompletedNode(containerStatus(cid, 1))
     assert result.roleInstance != null
-    assert result.containerFailed 
+    assert result.containerFailed
     RoleStatus status = role0Status
     assert status.failed == 1
     assert status.startFailed == 1
@@ -141,8 +142,8 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest
     ContainerId cid = ids[0]
     appState.onNodeManagerContainerStartFailed(cid, new SliderException("oops"))
     RoleStatus status = role0Status
-    assert status.failed == 1
-    assert status.startFailed == 1
+    assert 1 == status.failed
+    assert 1 == status.startFailed
 
     
     RoleHistory history = appState.roleHistory
@@ -182,7 +183,7 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest
 
 
   @Test
-  public void testFailureWindow() throws Throwable {
+  public void testRoleStatusFailureWindow() throws Throwable {
 
     ResetFailureWindow resetter = new ResetFailureWindow();
 
@@ -209,4 +210,127 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest
       }
   }
 
+  @Test
+  public void testRoleStatusFailed() throws Throwable {
+    def status = role0Status
+    // general failure
+    status.noteFailed(false, "text",ContainerOutcome.Failed)
+    assert 1 == status.failed
+    assert 1L == status.failedRecently
+    assert 0L == status.limitsExceeded
+    assert 0L == status.preempted
+    assert 0L == status.nodeFailed
+
+    ResetFailureWindow resetter = new ResetFailureWindow();
+    resetter.execute(null, null, appState)
+    assert 1 == status.failed
+    assert 0L == status.failedRecently
+  }
+
+  @Test
+  public void testRoleStatusFailedLimitsExceeded() throws Throwable {
+    def status = role0Status
+    // limits exceeded
+    status.noteFailed(false, "text",ContainerOutcome.Failed_limits_exceeded)
+    assert 1 == status.failed
+    assert 1L == status.failedRecently
+    assert 1L == status.limitsExceeded
+    assert 0L == status.preempted
+    assert 0L == status.nodeFailed
+
+    ResetFailureWindow resetter = new ResetFailureWindow();
+    resetter.execute(null, null, appState)
+    assert 1 == status.failed
+    assert 0L == status.failedRecently
+    assert 1L == status.limitsExceeded
+  }
+
+
+  @Test
+  public void testRoleStatusFailedPrempted() throws Throwable {
+    def status = role0Status
+    // pre-empted
+    status.noteFailed(false, "text", ContainerOutcome.Preempted)
+    assert 0 == status.failed
+    assert 1L == status.preempted
+    assert 0L == status.failedRecently
+    assert 0L == status.nodeFailed
+
+    ResetFailureWindow resetter = new ResetFailureWindow();
+    resetter.execute(null, null, appState)
+    assert 1L == status.preempted
+  }
+
+
+  @Test
+  public void testRoleStatusFailedNode() throws Throwable {
+    def status = role0Status
+    // node failure
+    status.noteFailed(false, "text", ContainerOutcome.Node_failure)
+    assert 1 == status.failed
+    assert 0L == status.failedRecently
+    assert 0L == status.limitsExceeded
+    assert 0L == status.preempted
+    assert 1L == status.nodeFailed
+  }
+
+  @Test
+  public void testNodeEntryCompleted() throws Throwable {
+    NodeEntry nodeEntry = new NodeEntry(1)
+    nodeEntry.containerCompleted(true, ContainerOutcome.Completed);
+    assert 0 == nodeEntry.failed
+    assert 0 == nodeEntry.failedRecently
+    assert 0 == nodeEntry.startFailed
+    assert 0 == nodeEntry.preempted
+    assert 0 == nodeEntry.active
+    assert nodeEntry.available
+  }
+
+  @Test
+  public void testNodeEntryFailed() throws Throwable {
+    NodeEntry nodeEntry = new NodeEntry(1)
+    nodeEntry.containerCompleted(false, ContainerOutcome.Failed);
+    assert 1 == nodeEntry.failed
+    assert 1 == nodeEntry.failedRecently
+    assert 0 == nodeEntry.startFailed
+    assert 0 == nodeEntry.preempted
+    assert 0 == nodeEntry.active
+    assert nodeEntry.available
+    nodeEntry.resetFailedRecently()
+    assert 1 == nodeEntry.failed
+    assert 0 == nodeEntry.failedRecently
+  }
+
+  @Test
+  public void testNodeEntryLimitsExceeded() throws Throwable {
+    NodeEntry nodeEntry = new NodeEntry(1)
+    nodeEntry.containerCompleted(false, ContainerOutcome.Failed_limits_exceeded);
+    assert 0 == nodeEntry.failed
+    assert 0 == nodeEntry.failedRecently
+    assert 0 == nodeEntry.startFailed
+    assert 0 == nodeEntry.preempted
+  }
+
+  @Test
+  public void testNodeEntryPreempted() throws Throwable {
+    NodeEntry nodeEntry = new NodeEntry(1)
+    nodeEntry.containerCompleted(false, ContainerOutcome.Preempted);
+    assert 0 == nodeEntry.failed
+    assert 0 == nodeEntry.failedRecently
+    assert 0 == nodeEntry.startFailed
+    assert 1 == nodeEntry.preempted
+  }
+
+  @Test
+  public void testNodeEntryNodeFailure() throws Throwable {
+    NodeEntry nodeEntry = new NodeEntry(1)
+    nodeEntry.containerCompleted(false, ContainerOutcome.Node_failure);
+    assert 1 == nodeEntry.failed
+    assert 1 == nodeEntry.failedRecently
+    assert 0 == nodeEntry.startFailed
+    assert 0 == nodeEntry.preempted
+  }
+
+
+
 }

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy
index c7a38f5..4fdf39c 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy
@@ -301,7 +301,10 @@ class TestRoleHistoryContainerEvents extends BaseMockAppStateTest {
     roleHistory.onContainerStarted(container)
 
     //later, declare that it failed
-    roleHistory.onFailedContainer(container, false)
+    roleHistory.onFailedContainer(
+        container,
+        false,
+        ContainerOutcome.Failed)
     assert roleEntry.starting == 0
     assert roleEntry.available
     assert roleEntry.active == 0
@@ -330,7 +333,10 @@ class TestRoleHistoryContainerEvents extends BaseMockAppStateTest {
     NodeInstance allocated = nodemap.get(hostname)
     NodeEntry roleEntry = allocated.get(role)
     assert roleEntry.available
-    roleHistory.onFailedContainer(container, false)
+    roleHistory.onFailedContainer(
+        container,
+        false,
+        ContainerOutcome.Failed)
     assert roleEntry.starting == 0
     assert roleEntry.failed == 1
     assert roleEntry.available

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy
index b29a0b5..79d23e5 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy
@@ -23,7 +23,7 @@ import groovy.util.logging.Slf4j
 import org.apache.slider.providers.ProviderRole
 import org.apache.slider.server.appmaster.model.mock.BaseMockAppStateTest
 import org.apache.slider.server.appmaster.model.mock.MockFactory
-import org.apache.slider.server.appmaster.state.NodeEntry
+import org.apache.slider.server.appmaster.state.ContainerOutcome
 import org.apache.slider.server.appmaster.state.NodeInstance
 import org.apache.slider.server.appmaster.state.RoleHistory
 import org.apache.slider.server.appmaster.state.RoleStatus
@@ -132,10 +132,14 @@ class TestRoleHistoryFindNodesForNewInstances extends BaseMockAppStateTest {
     // mark age2 and active 0 as busy, expect a null back
 
     def entry0 = age2Active0.get(0)
-    entry0.containerCompleted(false)
+    entry0.containerCompleted(
+        false,
+        ContainerOutcome.Failed)
     assert entry0.failed
     assert entry0.failedRecently
-    entry0.containerCompleted(false)
+    entry0.containerCompleted(
+        false,
+        ContainerOutcome.Failed)
     assert !age2Active0.exceedsFailureThreshold(roleStat)
     // set failure to 1
     roleStat.providerRole.nodeFailureThreshold = 1

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy
index 3e5494f..29eefa5 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy
@@ -35,6 +35,7 @@ import org.apache.slider.core.main.LauncherExitCodes
 import org.apache.slider.server.appmaster.operations.AbstractRMOperation
 import org.apache.slider.server.appmaster.state.AppState
 import org.apache.slider.server.appmaster.state.ContainerAssignment
+import org.apache.slider.server.appmaster.state.ContainerOutcome
 import org.apache.slider.server.appmaster.state.NodeEntry
 import org.apache.slider.server.appmaster.state.NodeInstance
 import org.apache.slider.server.appmaster.state.RoleInstance
@@ -344,7 +345,9 @@ abstract class BaseMockAppStateTest extends SliderTestBase implements MockRoles
   public NodeEntry recordAsFailed(NodeInstance node, int id, int count) {
     def entry = node.getOrCreate(id)
     1.upto(count) {
-      entry.containerCompleted(false)
+      entry.containerCompleted(
+          false,
+          ContainerOutcome.Failed)
     }
     entry
   }

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy
index 9ec230c..c2ea837 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy
@@ -19,13 +19,13 @@ package org.apache.slider.server.appmaster.web.view
 import com.google.inject.AbstractModule
 import com.google.inject.Guice
 import com.google.inject.Injector
-import groovy.transform.CompileStatic
 import groovy.util.logging.Slf4j
 import org.apache.hadoop.yarn.api.records.Container
 import org.apache.hadoop.yarn.api.records.Priority
 import org.apache.hadoop.yarn.webapp.hamlet.Hamlet
 import org.apache.slider.providers.ProviderService
 import org.apache.slider.server.appmaster.model.mock.*
+import org.apache.slider.server.appmaster.state.ContainerOutcome
 import org.apache.slider.server.appmaster.state.ProviderAppState
 import org.apache.slider.server.appmaster.web.WebAppApi
 import org.apache.slider.server.appmaster.web.WebAppApiImpl
@@ -90,8 +90,8 @@ public class TestIndexBlock extends BaseMockAppStateTest {
     role1.incRequested()
     role1.incRequested()
     role1.incRequested()
-    role0.noteFailed(false, "")
-    role0.noteFailed(true, "")
+    role0.noteFailed(false, "", ContainerOutcome.Failed)
+    role0.noteFailed(true,  "", ContainerOutcome.Failed)
 
     StringWriter sw = new StringWriter(64);
     PrintWriter pw = new PrintWriter(sw);