You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@slider.apache.org by st...@apache.org on 2015/05/05 00:09:19 UTC
[2/3] incubator-slider git commit: SLIDER-856 pre-emption is not
treated as failure;
special handling for node failures (not treated as role failures), and
container limits exceeded (not treated as node failures). Counters in
RoleStatus and NodeEntry for this.
SLIDER-856 pre-emption is not treated as failure; special handling for node failures (not treated as role failures), and container limits exceeded (not treated as node failures). Counters in RoleStatus and NodeEntry for this.
Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/0a50c48e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/0a50c48e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/0a50c48e
Branch: refs/heads/develop
Commit: 0a50c48e713a2df27efd226215c9f748e4335708
Parents: 3d61bed
Author: Steve Loughran <st...@apache.org>
Authored: Mon May 4 23:08:32 2015 +0100
Committer: Steve Loughran <st...@apache.org>
Committed: Mon May 4 23:08:32 2015 +0100
----------------------------------------------------------------------
.../apache/slider/api/ClusterDescription.java | 28 +-
.../java/org/apache/slider/api/RoleKeys.java | 17 +-
.../java/org/apache/slider/api/StatusKeys.java | 3 +
.../org/apache/slider/api/proto/Messages.java | 342 +++++++++++++++++--
.../slider/api/proto/RestTypeMarshalling.java | 9 +-
.../slider/api/types/ComponentInformation.java | 14 +-
.../server/appmaster/SliderAppMaster.java | 2 -
.../slider/server/appmaster/state/AppState.java | 55 ++-
.../appmaster/state/ContainerOutcome.java | 61 ++++
.../server/appmaster/state/NodeEntry.java | 36 +-
.../server/appmaster/state/RoleHistory.java | 19 +-
.../server/appmaster/state/RoleStatus.java | 74 +++-
.../src/main/proto/SliderClusterMessages.proto | 5 +-
.../TestMockAppStateContainerFailure.groovy | 132 ++++++-
.../TestRoleHistoryContainerEvents.groovy | 10 +-
...stRoleHistoryFindNodesForNewInstances.groovy | 10 +-
.../model/mock/BaseMockAppStateTest.groovy | 5 +-
.../appmaster/web/view/TestIndexBlock.groovy | 6 +-
18 files changed, 725 insertions(+), 103 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/api/ClusterDescription.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/api/ClusterDescription.java b/slider-core/src/main/java/org/apache/slider/api/ClusterDescription.java
index 7db7e7f..025bd32 100644
--- a/slider-core/src/main/java/org/apache/slider/api/ClusterDescription.java
+++ b/slider-core/src/main/java/org/apache/slider/api/ClusterDescription.java
@@ -594,7 +594,7 @@ public class ClusterDescription implements Cloneable {
}
/**
- * Get a role opt; use {@link Integer#decode(String)} so as to take hex
+ * Get an integer role option; use {@link Integer#decode(String)} so as to take hex
* oct and bin values too.
*
* @param role role to get from
@@ -609,6 +609,21 @@ public class ClusterDescription implements Cloneable {
}
/**
+ * Get a long role option; use {@link Long#decode(String)} so as to take hex
+ * and octal values too.
+ *
+ * @param role role to get from
+ * @param option option name
+ * @param defVal default value
+ * @return parsed value
+ * @throws NumberFormatException if the option value could not be parsed.
+ */
+ public long getRoleOptLong(String role, String option, long defVal) {
+ String val = getRoleOpt(role, option, Long.toString(defVal));
+ return Long.decode(val);
+ }
+
+ /**
* Set a role option, creating the role if necessary
* @param role role name
* @param option option name
@@ -630,6 +645,17 @@ public class ClusterDescription implements Cloneable {
}
/**
+ * Set a role option from any object, using its {@code toString()} value.
+ * This works for boxed numeric values as well as other objects.
+ * @param role role name
+ * @param option option name
+ * @param val non-null value
+ */
+ public void setRoleOpt(String role, String option, Object val) {
+ setRoleOpt(role, option, val.toString());
+ }
+
+ /**
* Get the value of a role requirement (cores, RAM, etc).
* These are returned as integers, but there is special handling of the
* string {@link ResourceKeys#YARN_RESOURCE_MAX}, which triggers
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/api/RoleKeys.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/api/RoleKeys.java b/slider-core/src/main/java/org/apache/slider/api/RoleKeys.java
index 0f0fb8c..4512354 100644
--- a/slider-core/src/main/java/org/apache/slider/api/RoleKeys.java
+++ b/slider-core/src/main/java/org/apache/slider/api/RoleKeys.java
@@ -45,11 +45,26 @@ public interface RoleKeys {
String ROLE_RELEASING_INSTANCES = "role.releasing.instances";
/**
- * Status report: number currently being released: {@value}
+ * Status report: total number that have failed: {@value}
*/
String ROLE_FAILED_INSTANCES = "role.failed.instances";
/**
+ * Status report: number that have failed recently: {@value}
+ */
+ String ROLE_FAILED_RECENTLY_INSTANCES = "role.failed.recently.instances";
+
+ /**
+ * Status report: number that have failed for node-related issues: {@value}
+ */
+ String ROLE_NODE_FAILED_INSTANCES = "role.failed.node.instances";
+
+ /**
+ * Status report: number that have been pre-empted: {@value}
+ */
+ String ROLE_PREEMPTED_INSTANCES = "role.failed.preempted.instances";
+
+ /**
* Status report: number currently being released: {@value}
*/
String ROLE_FAILED_STARTING_INSTANCES = "role.failed.starting.instances";
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/api/StatusKeys.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/api/StatusKeys.java b/slider-core/src/main/java/org/apache/slider/api/StatusKeys.java
index ef68aad..6a0a2fa 100644
--- a/slider-core/src/main/java/org/apache/slider/api/StatusKeys.java
+++ b/slider-core/src/main/java/org/apache/slider/api/StatusKeys.java
@@ -27,6 +27,9 @@ public interface StatusKeys {
String STATISTICS_CONTAINERS_COMPLETED = "containers.completed";
String STATISTICS_CONTAINERS_DESIRED = "containers.desired";
String STATISTICS_CONTAINERS_FAILED = "containers.failed";
+ String STATISTICS_CONTAINERS_FAILED_RECENTLY = "containers.failed.recently";
+ String STATISTICS_CONTAINERS_FAILED_NODE = "containers.failed.node";
+ String STATISTICS_CONTAINERS_PREEMPTED = "containers.failed.preempted";
String STATISTICS_CONTAINERS_LIVE = "containers.live";
String STATISTICS_CONTAINERS_REQUESTED = "containers.requested";
String STATISTICS_CONTAINERS_STARTED = "containers.start.started";
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/api/proto/Messages.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/api/proto/Messages.java b/slider-core/src/main/java/org/apache/slider/api/proto/Messages.java
index a6c8da8..6c92f46 100644
--- a/slider-core/src/main/java/org/apache/slider/api/proto/Messages.java
+++ b/slider-core/src/main/java/org/apache/slider/api/proto/Messages.java
@@ -15070,6 +15070,36 @@ public final class Messages {
*/
com.google.protobuf.ByteString
getContainersBytes(int index);
+
+ // optional int32 failedRecently = 15;
+ /**
+ * <code>optional int32 failedRecently = 15;</code>
+ */
+ boolean hasFailedRecently();
+ /**
+ * <code>optional int32 failedRecently = 15;</code>
+ */
+ int getFailedRecently();
+
+ // optional int32 nodeFailed = 16;
+ /**
+ * <code>optional int32 nodeFailed = 16;</code>
+ */
+ boolean hasNodeFailed();
+ /**
+ * <code>optional int32 nodeFailed = 16;</code>
+ */
+ int getNodeFailed();
+
+ // optional int32 preempted = 17;
+ /**
+ * <code>optional int32 preempted = 17;</code>
+ */
+ boolean hasPreempted();
+ /**
+ * <code>optional int32 preempted = 17;</code>
+ */
+ int getPreempted();
}
/**
* Protobuf type {@code org.apache.slider.api.ComponentInformationProto}
@@ -15200,6 +15230,21 @@ public final class Messages {
containers_.add(input.readBytes());
break;
}
+ case 120: {
+ bitField0_ |= 0x00002000;
+ failedRecently_ = input.readInt32();
+ break;
+ }
+ case 128: {
+ bitField0_ |= 0x00004000;
+ nodeFailed_ = input.readInt32();
+ break;
+ }
+ case 136: {
+ bitField0_ |= 0x00008000;
+ preempted_ = input.readInt32();
+ break;
+ }
}
}
} catch (com.google.protobuf.InvalidProtocolBufferException e) {
@@ -15535,6 +15580,54 @@ public final class Messages {
return containers_.getByteString(index);
}
+ // optional int32 failedRecently = 15;
+ public static final int FAILEDRECENTLY_FIELD_NUMBER = 15;
+ private int failedRecently_;
+ /**
+ * <code>optional int32 failedRecently = 15;</code>
+ */
+ public boolean hasFailedRecently() {
+ return ((bitField0_ & 0x00002000) == 0x00002000);
+ }
+ /**
+ * <code>optional int32 failedRecently = 15;</code>
+ */
+ public int getFailedRecently() {
+ return failedRecently_;
+ }
+
+ // optional int32 nodeFailed = 16;
+ public static final int NODEFAILED_FIELD_NUMBER = 16;
+ private int nodeFailed_;
+ /**
+ * <code>optional int32 nodeFailed = 16;</code>
+ */
+ public boolean hasNodeFailed() {
+ return ((bitField0_ & 0x00004000) == 0x00004000);
+ }
+ /**
+ * <code>optional int32 nodeFailed = 16;</code>
+ */
+ public int getNodeFailed() {
+ return nodeFailed_;
+ }
+
+ // optional int32 preempted = 17;
+ public static final int PREEMPTED_FIELD_NUMBER = 17;
+ private int preempted_;
+ /**
+ * <code>optional int32 preempted = 17;</code>
+ */
+ public boolean hasPreempted() {
+ return ((bitField0_ & 0x00008000) == 0x00008000);
+ }
+ /**
+ * <code>optional int32 preempted = 17;</code>
+ */
+ public int getPreempted() {
+ return preempted_;
+ }
+
private void initFields() {
name_ = "";
priority_ = 0;
@@ -15550,6 +15643,9 @@ public final class Messages {
failureMessage_ = "";
placementPolicy_ = 0;
containers_ = com.google.protobuf.LazyStringArrayList.EMPTY;
+ failedRecently_ = 0;
+ nodeFailed_ = 0;
+ preempted_ = 0;
}
private byte memoizedIsInitialized = -1;
public final boolean isInitialized() {
@@ -15605,6 +15701,15 @@ public final class Messages {
for (int i = 0; i < containers_.size(); i++) {
output.writeBytes(14, containers_.getByteString(i));
}
+ if (((bitField0_ & 0x00002000) == 0x00002000)) {
+ output.writeInt32(15, failedRecently_);
+ }
+ if (((bitField0_ & 0x00004000) == 0x00004000)) {
+ output.writeInt32(16, nodeFailed_);
+ }
+ if (((bitField0_ & 0x00008000) == 0x00008000)) {
+ output.writeInt32(17, preempted_);
+ }
getUnknownFields().writeTo(output);
}
@@ -15675,6 +15780,18 @@ public final class Messages {
size += dataSize;
size += 1 * getContainersList().size();
}
+ if (((bitField0_ & 0x00002000) == 0x00002000)) {
+ size += com.google.protobuf.CodedOutputStream
+ .computeInt32Size(15, failedRecently_);
+ }
+ if (((bitField0_ & 0x00004000) == 0x00004000)) {
+ size += com.google.protobuf.CodedOutputStream
+ .computeInt32Size(16, nodeFailed_);
+ }
+ if (((bitField0_ & 0x00008000) == 0x00008000)) {
+ size += com.google.protobuf.CodedOutputStream
+ .computeInt32Size(17, preempted_);
+ }
size += getUnknownFields().getSerializedSize();
memoizedSerializedSize = size;
return size;
@@ -15765,6 +15882,21 @@ public final class Messages {
}
result = result && getContainersList()
.equals(other.getContainersList());
+ result = result && (hasFailedRecently() == other.hasFailedRecently());
+ if (hasFailedRecently()) {
+ result = result && (getFailedRecently()
+ == other.getFailedRecently());
+ }
+ result = result && (hasNodeFailed() == other.hasNodeFailed());
+ if (hasNodeFailed()) {
+ result = result && (getNodeFailed()
+ == other.getNodeFailed());
+ }
+ result = result && (hasPreempted() == other.hasPreempted());
+ if (hasPreempted()) {
+ result = result && (getPreempted()
+ == other.getPreempted());
+ }
result = result &&
getUnknownFields().equals(other.getUnknownFields());
return result;
@@ -15834,6 +15966,18 @@ public final class Messages {
hash = (37 * hash) + CONTAINERS_FIELD_NUMBER;
hash = (53 * hash) + getContainersList().hashCode();
}
+ if (hasFailedRecently()) {
+ hash = (37 * hash) + FAILEDRECENTLY_FIELD_NUMBER;
+ hash = (53 * hash) + getFailedRecently();
+ }
+ if (hasNodeFailed()) {
+ hash = (37 * hash) + NODEFAILED_FIELD_NUMBER;
+ hash = (53 * hash) + getNodeFailed();
+ }
+ if (hasPreempted()) {
+ hash = (37 * hash) + PREEMPTED_FIELD_NUMBER;
+ hash = (53 * hash) + getPreempted();
+ }
hash = (29 * hash) + getUnknownFields().hashCode();
memoizedHashCode = hash;
return hash;
@@ -15976,6 +16120,12 @@ public final class Messages {
bitField0_ = (bitField0_ & ~0x00001000);
containers_ = com.google.protobuf.LazyStringArrayList.EMPTY;
bitField0_ = (bitField0_ & ~0x00002000);
+ failedRecently_ = 0;
+ bitField0_ = (bitField0_ & ~0x00004000);
+ nodeFailed_ = 0;
+ bitField0_ = (bitField0_ & ~0x00008000);
+ preempted_ = 0;
+ bitField0_ = (bitField0_ & ~0x00010000);
return this;
}
@@ -16062,6 +16212,18 @@ public final class Messages {
bitField0_ = (bitField0_ & ~0x00002000);
}
result.containers_ = containers_;
+ if (((from_bitField0_ & 0x00004000) == 0x00004000)) {
+ to_bitField0_ |= 0x00002000;
+ }
+ result.failedRecently_ = failedRecently_;
+ if (((from_bitField0_ & 0x00008000) == 0x00008000)) {
+ to_bitField0_ |= 0x00004000;
+ }
+ result.nodeFailed_ = nodeFailed_;
+ if (((from_bitField0_ & 0x00010000) == 0x00010000)) {
+ to_bitField0_ |= 0x00008000;
+ }
+ result.preempted_ = preempted_;
result.bitField0_ = to_bitField0_;
onBuilt();
return result;
@@ -16131,6 +16293,15 @@ public final class Messages {
}
onChanged();
}
+ if (other.hasFailedRecently()) {
+ setFailedRecently(other.getFailedRecently());
+ }
+ if (other.hasNodeFailed()) {
+ setNodeFailed(other.getNodeFailed());
+ }
+ if (other.hasPreempted()) {
+ setPreempted(other.getPreempted());
+ }
this.mergeUnknownFields(other.getUnknownFields());
return this;
}
@@ -16762,6 +16933,105 @@ public final class Messages {
return this;
}
+ // optional int32 failedRecently = 15;
+ private int failedRecently_ ;
+ /**
+ * <code>optional int32 failedRecently = 15;</code>
+ */
+ public boolean hasFailedRecently() {
+ return ((bitField0_ & 0x00004000) == 0x00004000);
+ }
+ /**
+ * <code>optional int32 failedRecently = 15;</code>
+ */
+ public int getFailedRecently() {
+ return failedRecently_;
+ }
+ /**
+ * <code>optional int32 failedRecently = 15;</code>
+ */
+ public Builder setFailedRecently(int value) {
+ bitField0_ |= 0x00004000;
+ failedRecently_ = value;
+ onChanged();
+ return this;
+ }
+ /**
+ * <code>optional int32 failedRecently = 15;</code>
+ */
+ public Builder clearFailedRecently() {
+ bitField0_ = (bitField0_ & ~0x00004000);
+ failedRecently_ = 0;
+ onChanged();
+ return this;
+ }
+
+ // optional int32 nodeFailed = 16;
+ private int nodeFailed_ ;
+ /**
+ * <code>optional int32 nodeFailed = 16;</code>
+ */
+ public boolean hasNodeFailed() {
+ return ((bitField0_ & 0x00008000) == 0x00008000);
+ }
+ /**
+ * <code>optional int32 nodeFailed = 16;</code>
+ */
+ public int getNodeFailed() {
+ return nodeFailed_;
+ }
+ /**
+ * <code>optional int32 nodeFailed = 16;</code>
+ */
+ public Builder setNodeFailed(int value) {
+ bitField0_ |= 0x00008000;
+ nodeFailed_ = value;
+ onChanged();
+ return this;
+ }
+ /**
+ * <code>optional int32 nodeFailed = 16;</code>
+ */
+ public Builder clearNodeFailed() {
+ bitField0_ = (bitField0_ & ~0x00008000);
+ nodeFailed_ = 0;
+ onChanged();
+ return this;
+ }
+
+ // optional int32 preempted = 17;
+ private int preempted_ ;
+ /**
+ * <code>optional int32 preempted = 17;</code>
+ */
+ public boolean hasPreempted() {
+ return ((bitField0_ & 0x00010000) == 0x00010000);
+ }
+ /**
+ * <code>optional int32 preempted = 17;</code>
+ */
+ public int getPreempted() {
+ return preempted_;
+ }
+ /**
+ * <code>optional int32 preempted = 17;</code>
+ */
+ public Builder setPreempted(int value) {
+ bitField0_ |= 0x00010000;
+ preempted_ = value;
+ onChanged();
+ return this;
+ }
+ /**
+ * <code>optional int32 preempted = 17;</code>
+ */
+ public Builder clearPreempted() {
+ bitField0_ = (bitField0_ & ~0x00010000);
+ preempted_ = 0;
+ onChanged();
+ return this;
+ }
+
// @@protoc_insertion_point(builder_scope:org.apache.slider.api.ComponentInformationProto)
}
@@ -28687,46 +28957,48 @@ public final class Messages {
" \002(\t\022\023\n\013application\030\003 \002(\t\"`\n#Application" +
"LivenessInformationProto\022\034\n\024allRequestsS" +
"atisfied\030\001 \001(\010\022\033\n\023requestsOutstanding\030\002 " +
- "\001(\005\"\250\002\n\031ComponentInformationProto\022\014\n\004nam",
+ "\001(\005\"\347\002\n\031ComponentInformationProto\022\014\n\004nam",
"e\030\001 \001(\t\022\020\n\010priority\030\002 \001(\005\022\017\n\007desired\030\003 \001" +
"(\005\022\016\n\006actual\030\004 \001(\005\022\021\n\treleasing\030\005 \001(\005\022\021\n" +
"\trequested\030\006 \001(\005\022\016\n\006failed\030\007 \001(\005\022\017\n\007star" +
"ted\030\010 \001(\005\022\023\n\013startFailed\030\t \001(\005\022\021\n\tcomple" +
"ted\030\n \001(\005\022\026\n\016totalRequested\030\013 \001(\005\022\026\n\016fai" +
"lureMessage\030\014 \001(\t\022\027\n\017placementPolicy\030\r \001" +
- "(\005\022\022\n\ncontainers\030\016 \003(\t\"\210\002\n\031ContainerInfo" +
- "rmationProto\022\023\n\013containerId\030\001 \001(\t\022\021\n\tcom" +
- "ponent\030\002 \001(\t\022\020\n\010released\030\003 \001(\010\022\r\n\005state\030" +
- "\004 \001(\005\022\020\n\010exitCode\030\005 \001(\005\022\023\n\013diagnostics\030\006",
- " \001(\t\022\022\n\ncreateTime\030\007 \001(\003\022\021\n\tstartTime\030\010 " +
- "\001(\003\022\016\n\006output\030\t \003(\t\022\014\n\004host\030\n \001(\t\022\017\n\007hos" +
- "tURL\030\013 \001(\t\022\021\n\tplacement\030\014 \001(\t\022\022\n\nappVers" +
- "ion\030\r \001(\t\"N\n\024PingInformationProto\022\014\n\004tex" +
- "t\030\001 \001(\t\022\014\n\004verb\030\002 \001(\t\022\014\n\004body\030\003 \001(\t\022\014\n\004t" +
- "ime\030\004 \001(\003\"\026\n\024GetModelRequestProto\"\035\n\033Get" +
- "ModelDesiredRequestProto\"$\n\"GetModelDesi" +
- "redAppconfRequestProto\"&\n$GetModelDesire" +
- "dResourcesRequestProto\"%\n#GetModelResolv" +
- "edAppconfRequestProto\"\'\n%GetModelResolve",
- "dResourcesRequestProto\"#\n!GetModelLiveRe" +
- "sourcesRequestProto\"\037\n\035GetLiveContainers" +
- "RequestProto\"u\n\036GetLiveContainersRespons" +
- "eProto\022\r\n\005names\030\001 \003(\t\022D\n\ncontainers\030\002 \003(" +
- "\01320.org.apache.slider.api.ContainerInfor" +
- "mationProto\"3\n\034GetLiveContainerRequestPr" +
- "oto\022\023\n\013containerId\030\001 \002(\t\"\037\n\035GetLiveCompo" +
- "nentsRequestProto\"u\n\036GetLiveComponentsRe" +
- "sponseProto\022\r\n\005names\030\001 \003(\t\022D\n\ncomponents" +
- "\030\002 \003(\01320.org.apache.slider.api.Component",
- "InformationProto\",\n\034GetLiveComponentRequ" +
- "estProto\022\014\n\004name\030\001 \002(\t\"$\n\"GetApplication" +
- "LivenessRequestProto\"\023\n\021EmptyPayloadProt" +
- "o\" \n\020WrappedJsonProto\022\014\n\004json\030\001 \002(\t\"h\n\037G" +
- "etCertificateStoreRequestProto\022\020\n\010hostna" +
- "me\030\001 \001(\t\022\023\n\013requesterId\030\002 \002(\t\022\020\n\010passwor" +
- "d\030\003 \002(\t\022\014\n\004type\030\004 \002(\t\"1\n GetCertificateS" +
- "toreResponseProto\022\r\n\005store\030\001 \002(\014B-\n\033org." +
- "apache.slider.api.protoB\010Messages\210\001\001\240\001\001"
+ "(\005\022\022\n\ncontainers\030\016 \003(\t\022\026\n\016failedRecently" +
+ "\030\017 \001(\005\022\022\n\nnodeFailed\030\020 \001(\005\022\021\n\tpreempted\030" +
+ "\021 \001(\005\"\210\002\n\031ContainerInformationProto\022\023\n\013c" +
+ "ontainerId\030\001 \001(\t\022\021\n\tcomponent\030\002 \001(\t\022\020\n\010r",
+ "eleased\030\003 \001(\010\022\r\n\005state\030\004 \001(\005\022\020\n\010exitCode" +
+ "\030\005 \001(\005\022\023\n\013diagnostics\030\006 \001(\t\022\022\n\ncreateTim" +
+ "e\030\007 \001(\003\022\021\n\tstartTime\030\010 \001(\003\022\016\n\006output\030\t \003" +
+ "(\t\022\014\n\004host\030\n \001(\t\022\017\n\007hostURL\030\013 \001(\t\022\021\n\tpla" +
+ "cement\030\014 \001(\t\022\022\n\nappVersion\030\r \001(\t\"N\n\024Ping" +
+ "InformationProto\022\014\n\004text\030\001 \001(\t\022\014\n\004verb\030\002" +
+ " \001(\t\022\014\n\004body\030\003 \001(\t\022\014\n\004time\030\004 \001(\003\"\026\n\024GetM" +
+ "odelRequestProto\"\035\n\033GetModelDesiredReque" +
+ "stProto\"$\n\"GetModelDesiredAppconfRequest" +
+ "Proto\"&\n$GetModelDesiredResourcesRequest",
+ "Proto\"%\n#GetModelResolvedAppconfRequestP" +
+ "roto\"\'\n%GetModelResolvedResourcesRequest" +
+ "Proto\"#\n!GetModelLiveResourcesRequestPro" +
+ "to\"\037\n\035GetLiveContainersRequestProto\"u\n\036G" +
+ "etLiveContainersResponseProto\022\r\n\005names\030\001" +
+ " \003(\t\022D\n\ncontainers\030\002 \003(\01320.org.apache.sl" +
+ "ider.api.ContainerInformationProto\"3\n\034Ge" +
+ "tLiveContainerRequestProto\022\023\n\013containerI" +
+ "d\030\001 \002(\t\"\037\n\035GetLiveComponentsRequestProto" +
+ "\"u\n\036GetLiveComponentsResponseProto\022\r\n\005na",
+ "mes\030\001 \003(\t\022D\n\ncomponents\030\002 \003(\01320.org.apac" +
+ "he.slider.api.ComponentInformationProto\"" +
+ ",\n\034GetLiveComponentRequestProto\022\014\n\004name\030" +
+ "\001 \002(\t\"$\n\"GetApplicationLivenessRequestPr" +
+ "oto\"\023\n\021EmptyPayloadProto\" \n\020WrappedJsonP" +
+ "roto\022\014\n\004json\030\001 \002(\t\"h\n\037GetCertificateStor" +
+ "eRequestProto\022\020\n\010hostname\030\001 \001(\t\022\023\n\013reque" +
+ "sterId\030\002 \002(\t\022\020\n\010password\030\003 \002(\t\022\014\n\004type\030\004" +
+ " \002(\t\"1\n GetCertificateStoreResponseProto" +
+ "\022\r\n\005store\030\001 \002(\014B-\n\033org.apache.slider.api",
+ ".protoB\010Messages\210\001\001\240\001\001"
};
com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner =
new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() {
@@ -28882,7 +29154,7 @@ public final class Messages {
internal_static_org_apache_slider_api_ComponentInformationProto_fieldAccessorTable = new
com.google.protobuf.GeneratedMessage.FieldAccessorTable(
internal_static_org_apache_slider_api_ComponentInformationProto_descriptor,
- new java.lang.String[] { "Name", "Priority", "Desired", "Actual", "Releasing", "Requested", "Failed", "Started", "StartFailed", "Completed", "TotalRequested", "FailureMessage", "PlacementPolicy", "Containers", });
+ new java.lang.String[] { "Name", "Priority", "Desired", "Actual", "Releasing", "Requested", "Failed", "Started", "StartFailed", "Completed", "TotalRequested", "FailureMessage", "PlacementPolicy", "Containers", "FailedRecently", "NodeFailed", "Preempted", });
internal_static_org_apache_slider_api_ContainerInformationProto_descriptor =
getDescriptor().getMessageTypes().get(25);
internal_static_org_apache_slider_api_ContainerInformationProto_fieldAccessorTable = new
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/api/proto/RestTypeMarshalling.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/api/proto/RestTypeMarshalling.java b/slider-core/src/main/java/org/apache/slider/api/proto/RestTypeMarshalling.java
index f005bc3..c408ed2 100644
--- a/slider-core/src/main/java/org/apache/slider/api/proto/RestTypeMarshalling.java
+++ b/slider-core/src/main/java/org/apache/slider/api/proto/RestTypeMarshalling.java
@@ -78,7 +78,7 @@ public class RestTypeMarshalling {
info.started = wire.getStarted();
info.startFailed = wire.getStartFailed();
info.totalRequested = wire.getTotalRequested();
- info.containers = new ArrayList<String>(wire.getContainersList());
+ info.containers = new ArrayList<>(wire.getContainersList());
if (wire.hasFailureMessage()) {
info.failureMessage = wire.getFailureMessage();
}
@@ -95,6 +95,7 @@ public class RestTypeMarshalling {
return builder.build();
}
+ @SuppressWarnings("IOResourceOpenedButNotSafelyClosed")
private static byte[] getStoreBytes(SecurityStore securityStore)
throws IOException {
InputStream is = new FileInputStream(securityStore.getFile());
@@ -105,8 +106,7 @@ public class RestTypeMarshalling {
return response.getStore().toByteArray();
}
- public static Messages.ComponentInformationProto
- marshall(ComponentInformation info) {
+ public static Messages.ComponentInformationProto marshall(ComponentInformation info) {
Messages.ComponentInformationProto.Builder builder =
Messages.ComponentInformationProto.newBuilder();
@@ -123,6 +123,9 @@ public class RestTypeMarshalling {
builder.setStarted(info.started);
builder.setStartFailed(info.startFailed);
builder.setTotalRequested(info.totalRequested);
+ builder.setNodeFailed(info.nodeFailed);
+ builder.setPreempted(info.preempted);
+ builder.setFailedRecently(info.failedRecently);
if (info.failureMessage != null) {
builder.setFailureMessage(info.failureMessage);
}
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/api/types/ComponentInformation.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/api/types/ComponentInformation.java b/slider-core/src/main/java/org/apache/slider/api/types/ComponentInformation.java
index 286c3a1..52f6c08 100644
--- a/slider-core/src/main/java/org/apache/slider/api/types/ComponentInformation.java
+++ b/slider-core/src/main/java/org/apache/slider/api/types/ComponentInformation.java
@@ -28,6 +28,14 @@ import java.util.Map;
/**
* Serializable version of component data.
+ * <p>
+ * This is sent in REST calls as a JSON object, but is also marshalled into
+ * a protobuf structure. Look at {@link org.apache.slider.api.proto.RestTypeMarshalling}
+ * for the specifics there.
+ * <p>
+ * This means that if any fields are added here, they must also be added to
+ * <code>src/main/proto/SliderClusterMessages.proto</code> and
+ * the protobuf structures rebuilt.
*/
@JsonIgnoreProperties(ignoreUnknown = true)
@JsonSerialize(include = JsonSerialize.Inclusion.NON_NULL)
@@ -40,6 +48,7 @@ public class ComponentInformation {
public int placementPolicy;
public int requested;
public int failed, started, startFailed, completed, totalRequested;
+ public int nodeFailed, failedRecently, preempted;
public String failureMessage;
public List<String> containers;
@@ -48,7 +57,7 @@ public class ComponentInformation {
* @return a map for use in statistics reports
*/
public Map<String, Integer> buildStatistics() {
- Map<String, Integer> stats = new HashMap<String, Integer>();
+ Map<String, Integer> stats = new HashMap<>();
stats.put(StatusKeys.STATISTICS_CONTAINERS_ACTIVE_REQUESTS, requested);
stats.put(StatusKeys.STATISTICS_CONTAINERS_COMPLETED, completed);
stats.put(StatusKeys.STATISTICS_CONTAINERS_DESIRED, desired);
@@ -57,6 +66,9 @@ public class ComponentInformation {
stats.put(StatusKeys.STATISTICS_CONTAINERS_REQUESTED, totalRequested);
stats.put(StatusKeys.STATISTICS_CONTAINERS_STARTED, started);
stats.put(StatusKeys.STATISTICS_CONTAINERS_START_FAILED, startFailed);
+ stats.put(StatusKeys.STATISTICS_CONTAINERS_FAILED_RECENTLY, failedRecently);
+ stats.put(StatusKeys.STATISTICS_CONTAINERS_FAILED_NODE, nodeFailed);
+ stats.put(StatusKeys.STATISTICS_CONTAINERS_PREEMPTED, preempted);
return stats;
}
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
index 3ab4501..7da627e 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
@@ -164,9 +164,7 @@ import org.apache.slider.server.services.yarnregistry.YarnRegistryViewForProvide
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.BufferedReader;
import java.io.File;
-import java.io.FileReader;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.URI;
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
index 5428a34..18eb578 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
@@ -93,6 +93,9 @@ import static org.apache.slider.api.ResourceKeys.YARN_LABEL_EXPRESSION;
import static org.apache.slider.api.ResourceKeys.YARN_MEMORY;
import static org.apache.slider.api.RoleKeys.ROLE_FAILED_INSTANCES;
import static org.apache.slider.api.RoleKeys.ROLE_FAILED_STARTING_INSTANCES;
+import static org.apache.slider.api.RoleKeys.ROLE_FAILED_RECENTLY_INSTANCES;
+import static org.apache.slider.api.RoleKeys.ROLE_NODE_FAILED_INSTANCES;
+import static org.apache.slider.api.RoleKeys.ROLE_PREEMPTED_INSTANCES;
import static org.apache.slider.api.RoleKeys.ROLE_RELEASING_INSTANCES;
import static org.apache.slider.api.RoleKeys.ROLE_REQUESTED_INSTANCES;
import static org.apache.slider.api.StateValues.STATE_CREATED;
@@ -1430,7 +1433,7 @@ public class AppState {
text = "container start failure";
}
instance.diagnostics = text;
- roleStatus.noteFailed(true, null);
+ roleStatus.noteFailed(true, text, ContainerOutcome.Failed);
getFailedNodes().put(containerId, instance);
roleHistory.onNodeManagerContainerStartFailed(instance.container);
}
@@ -1477,7 +1480,11 @@ public class AppState {
public static class NodeCompletionResult {
public boolean surplusNode = false;
public RoleInstance roleInstance;
- public boolean containerFailed;
+ // did the container fail for *any* reason?
+ public boolean containerFailed = false;
+ // detailed outcome on the container failure
+ public ContainerOutcome outcome = ContainerOutcome.Completed;
+ public int exitStatus = 0;
public boolean unknownNode = false;
@@ -1486,7 +1493,9 @@ public class AppState {
new StringBuilder("NodeCompletionResult{");
sb.append("surplusNode=").append(surplusNode);
sb.append(", roleInstance=").append(roleInstance);
+ sb.append(", exitStatus=").append(exitStatus);
sb.append(", containerFailed=").append(containerFailed);
+ sb.append(", outcome=").append(outcome);
sb.append(", unknownNode=").append(unknownNode);
sb.append('}');
return sb.toString();
@@ -1504,6 +1513,8 @@ public class AppState {
NodeCompletionResult result = new NodeCompletionResult();
RoleInstance roleInstance;
+ int exitStatus = status.getExitStatus();
+ result.exitStatus = exitStatus;
if (containersBeingReleased.containsKey(containerId)) {
log.info("Container was queued for release : {}", containerId);
Container container = containersBeingReleased.remove(containerId);
@@ -1516,14 +1527,18 @@ public class AppState {
actual,
releasing,
completedCount);
+ result.outcome = ContainerOutcome.Completed;
roleHistory.onReleaseCompleted(container);
} else if (surplusNodes.remove(containerId)) {
//its a surplus one being purged
result.surplusNode = true;
} else {
- //a container has failed
+ // a container has failed or been killed
+ // use the exit code to determine the outcome
result.containerFailed = true;
+ result.outcome = ContainerOutcome.fromExitStatus(exitStatus);
+
roleInstance = removeOwnedContainer(containerId);
if (roleInstance != null) {
//it was active, move it to failed
@@ -1544,32 +1559,34 @@ public class AppState {
boolean shortLived = isShortLived(roleInstance);
String message;
Container failedContainer = roleInstance.container;
-
+
//build the failure message
if (failedContainer != null) {
String completedLogsUrl = getLogsURLForContainer(failedContainer);
- message = String.format("Failure %s on host %s: %s",
+ message = String.format("Failure %s on host %s (%d): %s",
roleInstance.getContainerId().toString(),
failedContainer.getNodeId().getHost(),
+ exitStatus,
completedLogsUrl);
} else {
- message = String.format("Failure %s", containerId);
+ message = String.format("Failure %s (%d)", containerId, exitStatus);
}
- int failed = roleStatus.noteFailed(shortLived, message);
+ roleStatus.noteFailed(shortLived, message, result.outcome);
+ long failed = roleStatus.getFailed();
log.info("Current count of failed role[{}] {} = {}",
roleId, rolename, failed);
if (failedContainer != null) {
- roleHistory.onFailedContainer(failedContainer, shortLived);
+ roleHistory.onFailedContainer(failedContainer, shortLived, result.outcome);
}
-
+
} catch (YarnRuntimeException e1) {
log.error("Failed container of unknown role {}", roleId);
}
} else {
//this isn't a known container.
-
+
log.error("Notified of completed container {} that is not in the list" +
- " of active or failed containers", containerId);
+ " of active or failed containers", containerId);
completionOfUnknownContainerEvent.incrementAndGet();
result.unknownNode = true;
}
@@ -1587,7 +1604,7 @@ public class AppState {
RoleInstance node = getLiveNodes().remove(id);
if (node != null) {
node.state = STATE_DESTROYED;
- node.exitCode = status.getExitStatus();
+ node.exitCode = exitStatus;
node.diagnostics = status.getDiagnostics();
getCompletedNodes().put(id, node);
result.roleInstance = node;
@@ -1595,7 +1612,6 @@ public class AppState {
// not in the list
log.warn("Received notification of completion of unknown node {}", id);
completionOfNodeNotInLiveListEvent.incrementAndGet();
-
}
// and the active node list if present
@@ -1709,6 +1725,9 @@ public class AppState {
cd.setRoleOpt(rolename, ROLE_RELEASING_INSTANCES, role.getReleasing());
cd.setRoleOpt(rolename, ROLE_FAILED_INSTANCES, role.getFailed());
cd.setRoleOpt(rolename, ROLE_FAILED_STARTING_INSTANCES, role.getStartFailed());
+ cd.setRoleOpt(rolename, ROLE_FAILED_RECENTLY_INSTANCES, role.getFailedRecently());
+ cd.setRoleOpt(rolename, ROLE_NODE_FAILED_INSTANCES, role.getNodeFailed());
+ cd.setRoleOpt(rolename, ROLE_PREEMPTED_INSTANCES, role.getPreempted());
Map<String, Integer> stats = role.buildStatistics();
cd.statistics.put(rolename, stats);
}
@@ -1798,14 +1817,14 @@ public class AppState {
}
/**
- * Check the failure threshold for a role
+ * Check the "recent" failure threshold for a role
* @param role role to examine
* @throws TriggerClusterTeardownException if the role
* has failed too many times
*/
private void checkFailureThreshold(RoleStatus role)
throws TriggerClusterTeardownException {
- int failures = role.getFailed();
+ long failures = role.getFailedRecently();
int threshold = getFailureThresholdForRole(role);
log.debug("Failure count of component: {}: {}, threshold={}",
role.getName(), failures, threshold);
@@ -1814,7 +1833,7 @@ public class AppState {
throw new TriggerClusterTeardownException(
SliderExitCodes.EXIT_DEPLOYMENT_FAILED,
FinalApplicationStatus.FAILED, ErrorStrings.E_UNSTABLE_CLUSTER +
- " - failed with component %s failing %d times (%d in startup);" +
+ " - failed with component %s failed 'recently' %d times (%d in startup);" +
" threshold is %d - last failure: %s",
role.getName(),
role.getFailed(),
@@ -1853,11 +1872,11 @@ public class AppState {
}
/**
- * Reset the failure counts of all roles
+ * Reset the "recent" failure counts of all roles
*/
public void resetFailureCounts() {
for (RoleStatus roleStatus : getRoleStatusMap().values()) {
- int failed = roleStatus.resetFailed();
+ long failed = roleStatus.resetFailedRecently();
log.info("Resetting failure count of {}; was {}",
roleStatus.getName(),
failed);
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/server/appmaster/state/ContainerOutcome.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/ContainerOutcome.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/ContainerOutcome.java
new file mode 100644
index 0000000..59ab30b
--- /dev/null
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/ContainerOutcome.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.slider.server.appmaster.state;
+
+import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
+
+/**
+ * Container outcomes we care about; slightly simplified from
+ * {@link ContainerExitStatus} -and hopefully able to handle
+ * any new exit codes.
+ */
+public enum ContainerOutcome {
+ Completed,
+ Failed,
+ Failed_limits_exceeded,
+ Node_failure,
+ Preempted;
+
+ /**
+ * Build a container outcome from an exit status.
+ * The values in {@link ContainerExitStatus} are used
+ * here.
+ * @param exitStatus exit status
+ * @return an enumeration of the outcome.
+ */
+ public static ContainerOutcome fromExitStatus(int exitStatus) {
+ switch (exitStatus) {
+ case ContainerExitStatus.ABORTED:
+ case ContainerExitStatus.KILLED_BY_APPMASTER:
+ case ContainerExitStatus.KILLED_BY_RESOURCEMANAGER:
+ case ContainerExitStatus.KILLED_AFTER_APP_COMPLETION:
+ // could either be a release or node failure. Treat as completion
+ return Completed;
+ case ContainerExitStatus.DISKS_FAILED:
+ return Node_failure;
+ case ContainerExitStatus.PREEMPTED:
+ return Preempted;
+ case ContainerExitStatus.KILLED_EXCEEDED_PMEM:
+ case ContainerExitStatus.KILLED_EXCEEDED_VMEM:
+ return Failed_limits_exceeded;
+ default:
+ return exitStatus == 0 ? Completed : Failed;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeEntry.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeEntry.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeEntry.java
index 0aa2d42..0f46054 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeEntry.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeEntry.java
@@ -53,6 +53,7 @@ public class NodeEntry {
private int starting;
private int startFailed;
private int failed;
+ private int preempted;
/**
* Counter of "failed recently" events. These are all failures
* which have happened since it was last reset.
@@ -138,8 +139,7 @@ public class NodeEntry {
public synchronized boolean onStartFailed() {
decStarting();
++startFailed;
- ++failedRecently;
- return containerCompleted(false);
+ return containerCompleted(false, ContainerOutcome.Failed);
}
/**
@@ -183,14 +183,35 @@ public class NodeEntry {
* planned: dec our release count
* unplanned: dec our live count
* @param wasReleased true if this was planned
+ * @param outcome outcome of the container; only examined when the container was not released
* @return true if this node is now available
*/
- public synchronized boolean containerCompleted(boolean wasReleased) {
+ public synchronized boolean containerCompleted(boolean wasReleased, ContainerOutcome outcome) {
if (wasReleased) {
releasing = RoleHistoryUtils.decToFloor(releasing);
} else {
- ++failed;
- ++failedRecently;
+ // for the node, we use the outcome of the failure to decide
+ // whether this is potentially "node-related"
+ switch(outcome) {
+ // general "any reason" app failure
+ case Failed:
+ // specific node failure
+ case Node_failure:
+
+ ++failed;
+ ++failedRecently;
+ break;
+
+ case Preempted:
+ preempted++;
+ break;
+
+ // failures which are node-independent
+ case Failed_limits_exceeded:
+ case Completed:
+ default:
+ break;
+ }
}
decLive();
return isAvailable();
@@ -219,6 +240,10 @@ public class NodeEntry {
return failedRecently;
}
+ public synchronized int getPreempted() {
+ return preempted;
+ }
+
/**
* Reset the failed recently count.
*/
@@ -236,6 +261,7 @@ public class NodeEntry {
sb.append(", releasing=").append(releasing);
sb.append(", lastUsed=").append(lastUsed);
sb.append(", failedRecently=").append(failedRecently);
+ sb.append(", preempted=").append(preempted);
sb.append(", startFailed=").append(startFailed);
sb.append('}');
return sb.toString();
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
index bb482af..926d440 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
@@ -767,7 +767,7 @@ public class RoleHistory {
* @return true if the node was queued
*/
public boolean onNodeManagerContainerStartFailed(Container container) {
- return markContainerFinished(container, false, true);
+ return markContainerFinished(container, false, true, ContainerOutcome.Failed);
}
/**
@@ -808,7 +808,7 @@ public class RoleHistory {
* @return true if the node was queued
*/
public boolean onReleaseCompleted(Container container) {
- return markContainerFinished(container, true, false);
+ return markContainerFinished(container, true, false, ContainerOutcome.Completed); // a release is not a failure
}
/**
@@ -817,10 +817,13 @@ public class RoleHistory {
*
* @param container completed container
* @param shortLived was the container short lived?
+ * @param outcome outcome of the failed container
* @return true if the node is considered available for work
*/
- public boolean onFailedContainer(Container container, boolean shortLived) {
- return markContainerFinished(container, false, shortLived);
+ public boolean onFailedContainer(Container container,
+ boolean shortLived,
+ ContainerOutcome outcome) {
+ return markContainerFinished(container, false, shortLived, outcome);
}
/**
@@ -831,11 +834,13 @@ public class RoleHistory {
* @param container completed container
* @param wasReleased was the container released?
* @param shortLived was the container short lived?
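+ * @param outcome outcome of the container; passed to the node entry unless this is a short-lived, unreleased container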
* @return true if the node was queued
*/
protected synchronized boolean markContainerFinished(Container container,
- boolean wasReleased,
- boolean shortLived) {
+ boolean wasReleased,
+ boolean shortLived,
+ ContainerOutcome outcome) {
NodeEntry nodeEntry = getOrCreateNodeEntry(container);
log.info("Finished container for node {}, released={}, shortlived={}",
nodeEntry.rolePriority, wasReleased, shortLived);
@@ -844,7 +849,7 @@ public class RoleHistory {
nodeEntry.onStartFailed();
available = false;
} else {
- available = nodeEntry.containerCompleted(wasReleased);
+ available = nodeEntry.containerCompleted(wasReleased, outcome);
maybeQueueNodeForWork(container, nodeEntry, available);
}
touch();
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
index 899948f..e2974fc 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
@@ -25,6 +25,7 @@ import org.apache.slider.providers.ProviderRole;
import java.io.Serializable;
import java.util.Comparator;
import java.util.Map;
+import java.util.concurrent.atomic.AtomicLong;
/**
@@ -43,7 +44,12 @@ public final class RoleStatus implements Cloneable {
private final ProviderRole providerRole;
private int desired, actual, requested, releasing;
- private int failed, started, startFailed, completed, totalRequested;
+ private int failed, startFailed;
+ private int started, completed, totalRequested;
+ private final AtomicLong preempted = new AtomicLong(0);
+ private final AtomicLong nodeFailed = new AtomicLong(0);
+ private final AtomicLong failedRecently = new AtomicLong(0);
+ private final AtomicLong limitsExceeded = new AtomicLong(0);
private String failureMessage = "";
@@ -163,15 +169,20 @@ public final class RoleStatus implements Cloneable {
return failed;
}
+ public synchronized long getFailedRecently() {
+ return failedRecently.get();
+ }
+
/**
- * Reset the failure counts
- * @return the total number of failures up to this point
+ * Reset the "recent" failure count
+ * @return the number of failures in the "recent" window
*/
- public synchronized int resetFailed() {
- int total = failed + startFailed;
- failed = 0;
- startFailed = 0;
- return total;
+ public long resetFailedRecently() {
+ return failedRecently.getAndSet(0);
+ }
+
+ public long getLimitsExceeded() {
+ return limitsExceeded.get();
}
/**
@@ -179,19 +190,37 @@ public final class RoleStatus implements Cloneable {
* be used in any diagnostics if an exception
* is later raised.
* @param startupFailure flag to indicate this was a startup event
- * @return the number of failures
* @param text text about the failure
+ * @param outcome outcome of the container
*/
- public synchronized int noteFailed(boolean startupFailure, String text) {
- int current = ++failed;
+ public synchronized void noteFailed(boolean startupFailure, String text,
+ ContainerOutcome outcome) {
if (text != null) {
failureMessage = text;
}
- //have a look to see if it short lived
- if (startupFailure) {
- incStartFailed();
+ switch (outcome) {
+ case Preempted:
+ preempted.incrementAndGet();
+ break;
+
+ case Node_failure:
+ nodeFailed.incrementAndGet();
+ failed++;
+ break;
+
+ case Failed_limits_exceeded: // exceeded memory or CPU; app/configuration related
+ limitsExceeded.incrementAndGet();
+ // fall through
+ case Failed: // application failure, possibly node related, possibly not
+ default: // anything else (future-proofing)
+ failed++;
+ failedRecently.incrementAndGet();
+ // if this was flagged as a startup failure, count it as a start failure too
+ if (startupFailure) {
+ incStartFailed();
+ }
+ break;
}
- return current;
}
public synchronized int getStartFailed() {
@@ -229,7 +258,14 @@ public final class RoleStatus implements Cloneable {
return totalRequested;
}
-
+ public long getPreempted() {
+ return preempted.get();
+ }
+
+ public long getNodeFailed() {
+ return nodeFailed.get();
+ }
+
/**
* Get the number of roles we are short of.
* nodes released are ignored.
@@ -267,6 +303,9 @@ public final class RoleStatus implements Cloneable {
", requested=" + requested +
", releasing=" + releasing +
", failed=" + failed +
+ ", failed recently=" + failedRecently.get() +
+ ", node failed=" + nodeFailed.get() +
+ ", pre-empted=" + preempted.get() +
", started=" + started +
", startFailed=" + startFailed +
", completed=" + completed +
@@ -313,6 +352,9 @@ public final class RoleStatus implements Cloneable {
info.placementPolicy = getPlacementPolicy();
info.failureMessage = failureMessage;
info.totalRequested = totalRequested;
+ info.failedRecently = failedRecently.intValue();
+ info.nodeFailed = nodeFailed.intValue();
+ info.preempted = preempted.intValue();
return info;
}
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/main/proto/SliderClusterMessages.proto
----------------------------------------------------------------------
diff --git a/slider-core/src/main/proto/SliderClusterMessages.proto b/slider-core/src/main/proto/SliderClusterMessages.proto
index 5e770d5..caca87b 100644
--- a/slider-core/src/main/proto/SliderClusterMessages.proto
+++ b/slider-core/src/main/proto/SliderClusterMessages.proto
@@ -246,7 +246,10 @@ message ComponentInformationProto {
optional int32 totalRequested = 11;
optional string failureMessage = 12 ;
optional int32 placementPolicy = 13;
- repeated string containers = 14;
+ repeated string containers = 14;
+ optional int32 failedRecently = 15;
+ optional int32 nodeFailed = 16;
+ optional int32 preempted = 17;
}
/*
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy
index 6368a3d..1b79115 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateContainerFailure.groovy
@@ -30,6 +30,7 @@ import org.apache.slider.server.appmaster.model.mock.BaseMockAppStateTest
import org.apache.slider.server.appmaster.model.mock.MockRoles
import org.apache.slider.server.appmaster.model.mock.MockYarnEngine
import org.apache.slider.server.appmaster.state.AppState
+import org.apache.slider.server.appmaster.state.ContainerOutcome
import org.apache.slider.server.appmaster.state.NodeEntry
import org.apache.slider.server.appmaster.state.NodeInstance
import org.apache.slider.server.appmaster.state.RoleHistory
@@ -87,7 +88,7 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest
assert appState.isShortLived(instance)
AppState.NodeCompletionResult result = appState.onCompletedNode(containerStatus(cid, 1))
assert result.roleInstance != null
- assert result.containerFailed
+ assert result.containerFailed
RoleStatus status = role0Status
assert status.failed == 1
assert status.startFailed == 1
@@ -141,8 +142,8 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest
ContainerId cid = ids[0]
appState.onNodeManagerContainerStartFailed(cid, new SliderException("oops"))
RoleStatus status = role0Status
- assert status.failed == 1
- assert status.startFailed == 1
+ assert 1 == status.failed
+ assert 1 == status.startFailed
RoleHistory history = appState.roleHistory
@@ -182,7 +183,7 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest
@Test
- public void testFailureWindow() throws Throwable {
+ public void testRoleStatusFailureWindow() throws Throwable {
ResetFailureWindow resetter = new ResetFailureWindow();
@@ -209,4 +210,127 @@ class TestMockAppStateContainerFailure extends BaseMockAppStateTest
}
}
+ @Test
+ public void testRoleStatusFailed() throws Throwable {
+ def status = role0Status
+ // general failure
+ status.noteFailed(false, "text",ContainerOutcome.Failed)
+ assert 1 == status.failed
+ assert 1L == status.failedRecently
+ assert 0L == status.limitsExceeded
+ assert 0L == status.preempted
+ assert 0L == status.nodeFailed
+
+ ResetFailureWindow resetter = new ResetFailureWindow();
+ resetter.execute(null, null, appState)
+ assert 1 == status.failed
+ assert 0L == status.failedRecently
+ }
+
+ @Test
+ public void testRoleStatusFailedLimitsExceeded() throws Throwable {
+ def status = role0Status
+ // limits exceeded
+ status.noteFailed(false, "text",ContainerOutcome.Failed_limits_exceeded)
+ assert 1 == status.failed
+ assert 1L == status.failedRecently
+ assert 1L == status.limitsExceeded
+ assert 0L == status.preempted
+ assert 0L == status.nodeFailed
+
+ ResetFailureWindow resetter = new ResetFailureWindow();
+ resetter.execute(null, null, appState)
+ assert 1 == status.failed
+ assert 0L == status.failedRecently
+ assert 1L == status.limitsExceeded
+ }
+
+
+ @Test
+ public void testRoleStatusFailedPrempted() throws Throwable {
+ def status = role0Status
+ // pre-empted
+ status.noteFailed(false, "text", ContainerOutcome.Preempted)
+ assert 0 == status.failed
+ assert 1L == status.preempted
+ assert 0L == status.failedRecently
+ assert 0L == status.nodeFailed
+
+ ResetFailureWindow resetter = new ResetFailureWindow();
+ resetter.execute(null, null, appState)
+ assert 1L == status.preempted
+ }
+
+
+ @Test
+ public void testRoleStatusFailedNode() throws Throwable {
+ def status = role0Status
+ // node failure
+ status.noteFailed(false, "text", ContainerOutcome.Node_failure)
+ assert 1 == status.failed
+ assert 0L == status.failedRecently
+ assert 0L == status.limitsExceeded
+ assert 0L == status.preempted
+ assert 1L == status.nodeFailed
+ }
+
+ @Test
+ public void testNodeEntryCompleted() throws Throwable {
+ NodeEntry nodeEntry = new NodeEntry(1)
+ nodeEntry.containerCompleted(true, ContainerOutcome.Completed);
+ assert 0 == nodeEntry.failed
+ assert 0 == nodeEntry.failedRecently
+ assert 0 == nodeEntry.startFailed
+ assert 0 == nodeEntry.preempted
+ assert 0 == nodeEntry.active
+ assert nodeEntry.available
+ }
+
+ @Test
+ public void testNodeEntryFailed() throws Throwable {
+ NodeEntry nodeEntry = new NodeEntry(1)
+ nodeEntry.containerCompleted(false, ContainerOutcome.Failed);
+ assert 1 == nodeEntry.failed
+ assert 1 == nodeEntry.failedRecently
+ assert 0 == nodeEntry.startFailed
+ assert 0 == nodeEntry.preempted
+ assert 0 == nodeEntry.active
+ assert nodeEntry.available
+ nodeEntry.resetFailedRecently()
+ assert 1 == nodeEntry.failed
+ assert 0 == nodeEntry.failedRecently
+ }
+
+ @Test
+ public void testNodeEntryLimitsExceeded() throws Throwable {
+ NodeEntry nodeEntry = new NodeEntry(1)
+ nodeEntry.containerCompleted(false, ContainerOutcome.Failed_limits_exceeded);
+ assert 0 == nodeEntry.failed
+ assert 0 == nodeEntry.failedRecently
+ assert 0 == nodeEntry.startFailed
+ assert 0 == nodeEntry.preempted
+ }
+
+ @Test
+ public void testNodeEntryPreempted() throws Throwable {
+ NodeEntry nodeEntry = new NodeEntry(1)
+ nodeEntry.containerCompleted(false, ContainerOutcome.Preempted);
+ assert 0 == nodeEntry.failed
+ assert 0 == nodeEntry.failedRecently
+ assert 0 == nodeEntry.startFailed
+ assert 1 == nodeEntry.preempted
+ }
+
+ @Test
+ public void testNodeEntryNodeFailure() throws Throwable {
+ NodeEntry nodeEntry = new NodeEntry(1)
+ nodeEntry.containerCompleted(false, ContainerOutcome.Node_failure);
+ assert 1 == nodeEntry.failed
+ assert 1 == nodeEntry.failedRecently
+ assert 0 == nodeEntry.startFailed
+ assert 0 == nodeEntry.preempted
+ }
+
+
+
}
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy
index c7a38f5..4fdf39c 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy
@@ -301,7 +301,10 @@ class TestRoleHistoryContainerEvents extends BaseMockAppStateTest {
roleHistory.onContainerStarted(container)
//later, declare that it failed
- roleHistory.onFailedContainer(container, false)
+ roleHistory.onFailedContainer(
+ container,
+ false,
+ ContainerOutcome.Failed)
assert roleEntry.starting == 0
assert roleEntry.available
assert roleEntry.active == 0
@@ -330,7 +333,10 @@ class TestRoleHistoryContainerEvents extends BaseMockAppStateTest {
NodeInstance allocated = nodemap.get(hostname)
NodeEntry roleEntry = allocated.get(role)
assert roleEntry.available
- roleHistory.onFailedContainer(container, false)
+ roleHistory.onFailedContainer(
+ container,
+ false,
+ ContainerOutcome.Failed)
assert roleEntry.starting == 0
assert roleEntry.failed == 1
assert roleEntry.available
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy
index b29a0b5..79d23e5 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryFindNodesForNewInstances.groovy
@@ -23,7 +23,7 @@ import groovy.util.logging.Slf4j
import org.apache.slider.providers.ProviderRole
import org.apache.slider.server.appmaster.model.mock.BaseMockAppStateTest
import org.apache.slider.server.appmaster.model.mock.MockFactory
-import org.apache.slider.server.appmaster.state.NodeEntry
+import org.apache.slider.server.appmaster.state.ContainerOutcome
import org.apache.slider.server.appmaster.state.NodeInstance
import org.apache.slider.server.appmaster.state.RoleHistory
import org.apache.slider.server.appmaster.state.RoleStatus
@@ -132,10 +132,14 @@ class TestRoleHistoryFindNodesForNewInstances extends BaseMockAppStateTest {
// mark age2 and active 0 as busy, expect a null back
def entry0 = age2Active0.get(0)
- entry0.containerCompleted(false)
+ entry0.containerCompleted(
+ false,
+ ContainerOutcome.Failed)
assert entry0.failed
assert entry0.failedRecently
- entry0.containerCompleted(false)
+ entry0.containerCompleted(
+ false,
+ ContainerOutcome.Failed)
assert !age2Active0.exceedsFailureThreshold(roleStat)
// set failure to 1
roleStat.providerRole.nodeFailureThreshold = 1
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy
index 3e5494f..29eefa5 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/BaseMockAppStateTest.groovy
@@ -35,6 +35,7 @@ import org.apache.slider.core.main.LauncherExitCodes
import org.apache.slider.server.appmaster.operations.AbstractRMOperation
import org.apache.slider.server.appmaster.state.AppState
import org.apache.slider.server.appmaster.state.ContainerAssignment
+import org.apache.slider.server.appmaster.state.ContainerOutcome
import org.apache.slider.server.appmaster.state.NodeEntry
import org.apache.slider.server.appmaster.state.NodeInstance
import org.apache.slider.server.appmaster.state.RoleInstance
@@ -344,7 +345,9 @@ abstract class BaseMockAppStateTest extends SliderTestBase implements MockRoles
public NodeEntry recordAsFailed(NodeInstance node, int id, int count) {
def entry = node.getOrCreate(id)
1.upto(count) {
- entry.containerCompleted(false)
+ entry.containerCompleted(
+ false,
+ ContainerOutcome.Failed)
}
entry
}
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/0a50c48e/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy
index 9ec230c..c2ea837 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/web/view/TestIndexBlock.groovy
@@ -19,13 +19,13 @@ package org.apache.slider.server.appmaster.web.view
import com.google.inject.AbstractModule
import com.google.inject.Guice
import com.google.inject.Injector
-import groovy.transform.CompileStatic
import groovy.util.logging.Slf4j
import org.apache.hadoop.yarn.api.records.Container
import org.apache.hadoop.yarn.api.records.Priority
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet
import org.apache.slider.providers.ProviderService
import org.apache.slider.server.appmaster.model.mock.*
+import org.apache.slider.server.appmaster.state.ContainerOutcome
import org.apache.slider.server.appmaster.state.ProviderAppState
import org.apache.slider.server.appmaster.web.WebAppApi
import org.apache.slider.server.appmaster.web.WebAppApiImpl
@@ -90,8 +90,8 @@ public class TestIndexBlock extends BaseMockAppStateTest {
role1.incRequested()
role1.incRequested()
role1.incRequested()
- role0.noteFailed(false, "")
- role0.noteFailed(true, "")
+ role0.noteFailed(false, "", ContainerOutcome.Failed)
+ role0.noteFailed(true, "", ContainerOutcome.Failed)
StringWriter sw = new StringWriter(64);
PrintWriter pw = new PrintWriter(sw);