You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@slider.apache.org by sm...@apache.org on 2015/01/31 01:54:47 UTC
incubator-slider git commit: SLIDER-743. Include node failure history
when choosing placement hints
Repository: incubator-slider
Updated Branches:
refs/heads/develop 922439e58 -> 39e04e36e
SLIDER-743. Include node failure history when choosing placement hints
Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/39e04e36
Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/39e04e36
Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/39e04e36
Branch: refs/heads/develop
Commit: 39e04e36ed72e8a40434e82db0d7f0f210f26f1d
Parents: 922439e
Author: Sumit Mohanty <sm...@hortonworks.com>
Authored: Fri Jan 30 16:54:26 2015 -0800
Committer: Sumit Mohanty <sm...@hortonworks.com>
Committed: Fri Jan 30 16:54:26 2015 -0800
----------------------------------------------------------------------
.../org/apache/slider/api/ResourceKeys.java | 13 ++++++++-
.../apache/slider/providers/ProviderRole.java | 11 ++++++--
.../slideram/SliderAMClientProvider.java | 3 +-
.../server/appmaster/SliderAppMaster.java | 1 -
.../slider/server/appmaster/state/AppState.java | 26 ++++++++++++++++--
.../appmaster/state/OutstandingRequest.java | 20 ++++++++++++--
.../server/appmaster/state/RoleHistory.java | 4 +--
.../server/appmaster/state/RoleStatus.java | 9 ++++++
.../TestMockAppStateDynamicRoles.groovy | 9 ++++++
.../TestRoleHistoryRequestTracking.groovy | 29 +++++++++++++++++++-
10 files changed, 110 insertions(+), 15 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java b/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
index 52633f4..50ca82f 100644
--- a/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
+++ b/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
@@ -100,7 +100,11 @@ public interface ResourceKeys {
*/
String COMPONENT_PLACEMENT_POLICY = "yarn.component.placement.policy";
-
+ /**
+ * Maximum number of node failures that can be tolerated by a component on a specific node
+ */
+ String NODE_FAILURE_THRESHOLD =
+ "yarn.node.failure.threshold";
/**
* maximum number of failed containers (in a single role)
@@ -130,6 +134,13 @@ public interface ResourceKeys {
int DEFAULT_CONTAINER_FAILURE_THRESHOLD = 5;
/**
+ * Default node failure threshold for a component instance: {@value}
+ * Should be lower than the default component failure threshold to allow
+ * the component to start elsewhere
+ */
+ int DEFAULT_NODE_FAILURE_THRESHOLD = 3;
+
+ /**
* Log aggregation include, exclude patterns
*/
String YARN_LOG_INCLUDE_PATTERNS = "yarn.log.include.patterns";
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java b/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
index 5b85f7b..17124d2 100644
--- a/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
+++ b/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
@@ -18,6 +18,8 @@
package org.apache.slider.providers;
+import org.apache.slider.api.ResourceKeys;
+
/**
* Provider role and key for use in app requests.
*
@@ -28,15 +30,17 @@ public final class ProviderRole {
public final String name;
public final int id;
public final int placementPolicy;
+ public final int nodeFailureThreshold;
public ProviderRole(String name, int id) {
- this(name, id, PlacementPolicy.DEFAULT);
+ this(name, id, PlacementPolicy.DEFAULT, ResourceKeys.DEFAULT_NODE_FAILURE_THRESHOLD);
}
- public ProviderRole(String name, int id, int policy) {
+ public ProviderRole(String name, int id, int policy, int nodeFailureThreshold) {
this.name = name;
this.id = id;
this.placementPolicy = policy;
+ this.nodeFailureThreshold = nodeFailureThreshold;
}
@Override
@@ -59,10 +63,11 @@ public final class ProviderRole {
@Override
public String toString() {
- return "ProviderRole{" +
+ return "ProviderRole {" +
"name='" + name + '\'' +
", id=" + id +
", policy=" + placementPolicy +
+ ", nodeFailureThreshold=" + nodeFailureThreshold +
'}';
}
}
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMClientProvider.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMClientProvider.java b/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMClientProvider.java
index b790713..1666c84 100644
--- a/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMClientProvider.java
+++ b/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMClientProvider.java
@@ -88,7 +88,8 @@ public class SliderAMClientProvider extends AbstractClientProvider
public static final ProviderRole APPMASTER =
new ProviderRole(COMPONENT_AM, KEY_AM,
- PlacementPolicy.EXCLUDE_FROM_FLEXING);
+ PlacementPolicy.EXCLUDE_FROM_FLEXING,
+ ResourceKeys.DEFAULT_NODE_FAILURE_THRESHOLD);
/**
* Initialize role list
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
index 2629a4d..b49366e 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
@@ -945,7 +945,6 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
* Creates and starts the web application, and adds a
* <code>WebAppService</code> service under the AM, to ensure
* a managed web application shutdown.
- * @param serviceConf AM configuration
* @param port port to deploy the web application on
* @param webAppApi web app API instance
*/
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
index 95a7ca5..61b0cd6 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
@@ -276,6 +276,7 @@ public class AppState {
private long startTimeThreshold;
private int failureThreshold = 10;
+ private int nodeFailureThreshold = 3;
private String logServerURL = "";
@@ -559,6 +560,9 @@ public class AppState {
failureThreshold = globalResOpts.getOptionInt(
ResourceKeys.CONTAINER_FAILURE_THRESHOLD,
ResourceKeys.DEFAULT_CONTAINER_FAILURE_THRESHOLD);
+ nodeFailureThreshold = globalResOpts.getOptionInt(
+ ResourceKeys.NODE_FAILURE_THRESHOLD,
+ ResourceKeys.DEFAULT_NODE_FAILURE_THRESHOLD);
initClusterStatus();
@@ -625,7 +629,8 @@ public class AppState {
int placement = SliderUtils.parseAndValidate("value of " + name + " " +
ResourceKeys.COMPONENT_PLACEMENT_POLICY,
placementOpt, 0, 0, -1);
- ProviderRole newRole = new ProviderRole(name, priority, placement);
+ ProviderRole newRole = new ProviderRole(name, priority, placement,
+ getNodeFailureThresholdForRole(name));
log.info("New {} ", newRole);
return newRole;
}
@@ -1376,6 +1381,7 @@ public class AppState {
if (started > 0) {
long duration = time - started;
shortlived = duration < (startTimeThreshold * 1000);
+ log.info("Duration {} and startTimeThreshold {}", duration, startTimeThreshold);
} else {
// never even saw a start event
shortlived = true;
@@ -1704,7 +1710,21 @@ public class AppState {
ResourceKeys.CONTAINER_FAILURE_THRESHOLD,
failureThreshold);
}
-
+
+ /**
+ * Get the node failure threshold for a specific role, falling back to
+ * the global one if the role does not define its own
+ * @param roleName role name
+ * @return the threshold for failures
+ */
+ private int getNodeFailureThresholdForRole(String roleName) {
+ ConfTreeOperations resources =
+ instanceDefinition.getResourceOperations();
+ return resources.getComponentOptInt(roleName,
+ ResourceKeys.NODE_FAILURE_THRESHOLD,
+ nodeFailureThreshold);
+ }
+
/**
* Reset the failure counts of all roles
*/
@@ -1712,7 +1732,7 @@ public class AppState {
for (RoleStatus roleStatus : getRoleStatusMap().values()) {
int failed = roleStatus.resetFailed();
log.info("Resetting failure count of {}; was {}",
- roleStatus.getName(),
+ roleStatus.getName(),
failed);
}
roleHistory.resetFailedRecently();
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java
index d6022e0..6acac89 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java
@@ -100,12 +100,26 @@ public final class OutstandingRequest {
* @param labelExpression label to satisfy
* @return the request to raise
*/
- public AMRMClient.ContainerRequest buildContainerRequest(Resource resource,
- RoleStatus role, long time, String labelExpression) {
+ public AMRMClient.ContainerRequest buildContainerRequest(
+ Resource resource, RoleStatus role, long time, String labelExpression) {
String[] hosts;
boolean relaxLocality;
requestedTime = time;
- if (node != null) {
+ boolean usePlacementHistory = role.isStrictPlacement();
+ if (!usePlacementHistory) {
+ // If strict placement does not mandate using the placement history, check
+ // that the number of recent failures on this node does not exceed the threshold
+ if (node != null) {
+ int numFailuresOnLastHost = node.get(role.getKey()).getFailedRecently();
+ usePlacementHistory = numFailuresOnLastHost <= role.getNodeFailureThreshold();
+ if(!usePlacementHistory) {
+ log.info("Recent node failures {} is higher than threshold {}. Dropping host {} from preference.",
+ numFailuresOnLastHost, role.getNodeFailureThreshold(), node.hostname);
+ }
+ }
+ }
+
+ if (node != null && usePlacementHistory) {
hosts = new String[1];
hosts[0] = node.hostname;
relaxLocality = !role.isStrictPlacement();
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
index 605a4f8..e94457a 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
@@ -443,7 +443,7 @@ public class RoleHistory {
/**
* Get the nodes for an ID -may be null
* @param id role ID
- * @return potenially null list
+ * @return potentially null list
*/
@VisibleForTesting
public List<NodeInstance> getNodesForRoleId(int id) {
@@ -755,7 +755,7 @@ public class RoleHistory {
boolean wasReleased,
boolean shortLived) {
NodeEntry nodeEntry = getOrCreateNodeEntry(container);
- log.debug("Finished container for node {}, released={}, shortlived={}",
+ log.info("Finished container for node {}, released={}, shortlived={}",
nodeEntry.rolePriority, wasReleased, shortLived);
boolean available;
if (shortLived) {
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
index 3edc5f1..22c5164 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
@@ -76,6 +76,15 @@ public final class RoleStatus implements Cloneable {
return providerRole.placementPolicy;
}
+ /**
+ * The number of failures on a specific node that can be tolerated
+ * before selecting a different node for placement
+ * @return the node failure threshold configured for this role
+ */
+ public int getNodeFailureThreshold() {
+ return providerRole.nodeFailureThreshold;
+ }
+
public boolean getExcludeFromFlexing() {
return 0 != (getPlacementPolicy() & PlacementPolicy.EXCLUDE_FROM_FLEXING);
}
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateDynamicRoles.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateDynamicRoles.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateDynamicRoles.groovy
index 83fb273..13ecf13 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateDynamicRoles.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateDynamicRoles.groovy
@@ -84,6 +84,8 @@ class TestMockAppStateDynamicRoles extends BaseMockAppStateTest
(ResourceKeys.COMPONENT_INSTANCES): "1",
(ResourceKeys.COMPONENT_PLACEMENT_POLICY):
Integer.toString(PlacementPolicy.STRICT),
+ (ResourceKeys.NODE_FAILURE_THRESHOLD):
+ Integer.toString(2),
]
instance.resourceOperations.components[ROLE5]= opts5
@@ -148,6 +150,13 @@ class TestMockAppStateDynamicRoles extends BaseMockAppStateTest
}
@Test
+ public void testNodeFailureThresholdPropagation() throws Throwable {
+ assert (appState.lookupRoleStatus(ROLE4).nodeFailureThreshold == 3)
+ assert (appState.lookupRoleStatus(ROLE5).nodeFailureThreshold == 2)
+
+ }
+
+ @Test
public void testLaxPlacementSecondRequestRole4() throws Throwable {
log.info("Initial engine state = $engine")
def role4 = appState.lookupRoleStatus(ROLE4)
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy
index 8f577e5..d87222d 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy
@@ -82,7 +82,34 @@ class TestRoleHistoryRequestTracking extends BaseMockAppStateTest {
List<NodeInstance> a2 = roleHistory.cloneAvailableList(0)
assertListEquals([age2Active0], a2)
}
-
+
+ @Test
+ public void testRequestedNodeOffListWithFailures() throws Throwable {
+ NodeInstance ni = roleHistory.findNodeForNewInstance(roleStatus)
+ assert age3Active0 == ni
+ AMRMClient.ContainerRequest req = roleHistory.requestInstanceOnNode(ni,
+ roleStatus,
+ resource,
+ "")
+ assert 1 == req.nodes.size()
+ List<NodeInstance> a2 = roleHistory.cloneAvailableList(0)
+ assertListEquals([age2Active0], a2)
+
+ age3Active0.get(0).failedRecently = 4
+ req = roleHistory.requestInstanceOnNode(ni,
+ roleStatus,
+ resource,
+ "")
+ assertNull(req.nodes)
+
+ age3Active0.get(0).failedRecently = 0
+ req = roleHistory.requestInstanceOnNode(ni,
+ roleStatus,
+ resource,
+ "")
+ assert 1 == req.nodes.size()
+ }
+
@Test
public void testFindAndRequestNode() throws Throwable {
AMRMClient.ContainerRequest req = roleHistory.requestNode(roleStatus, resource)