You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@slider.apache.org by sm...@apache.org on 2015/01/31 01:54:47 UTC

incubator-slider git commit: SLIDER-743. Include node failure history when choosing placement hints

Repository: incubator-slider
Updated Branches:
  refs/heads/develop 922439e58 -> 39e04e36e


SLIDER-743. Include node failure history when choosing placement hints


Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/39e04e36
Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/39e04e36
Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/39e04e36

Branch: refs/heads/develop
Commit: 39e04e36ed72e8a40434e82db0d7f0f210f26f1d
Parents: 922439e
Author: Sumit Mohanty <sm...@hortonworks.com>
Authored: Fri Jan 30 16:54:26 2015 -0800
Committer: Sumit Mohanty <sm...@hortonworks.com>
Committed: Fri Jan 30 16:54:26 2015 -0800

----------------------------------------------------------------------
 .../org/apache/slider/api/ResourceKeys.java     | 13 ++++++++-
 .../apache/slider/providers/ProviderRole.java   | 11 ++++++--
 .../slideram/SliderAMClientProvider.java        |  3 +-
 .../server/appmaster/SliderAppMaster.java       |  1 -
 .../slider/server/appmaster/state/AppState.java | 26 ++++++++++++++++--
 .../appmaster/state/OutstandingRequest.java     | 20 ++++++++++++--
 .../server/appmaster/state/RoleHistory.java     |  4 +--
 .../server/appmaster/state/RoleStatus.java      |  9 ++++++
 .../TestMockAppStateDynamicRoles.groovy         |  9 ++++++
 .../TestRoleHistoryRequestTracking.groovy       | 29 +++++++++++++++++++-
 10 files changed, 110 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java b/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
index 52633f4..50ca82f 100644
--- a/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
+++ b/slider-core/src/main/java/org/apache/slider/api/ResourceKeys.java
@@ -100,7 +100,11 @@ public interface ResourceKeys {
    */
   String COMPONENT_PLACEMENT_POLICY = "yarn.component.placement.policy";
 
-  
+  /**
+   * Maximum number of node failures that can be tolerated by a component on a specific node
+   */
+  String NODE_FAILURE_THRESHOLD =
+      "yarn.node.failure.threshold";
 
   /**
    * maximum number of failed containers (in a single role)
@@ -130,6 +134,13 @@ public interface ResourceKeys {
   int DEFAULT_CONTAINER_FAILURE_THRESHOLD = 5;
 
   /**
+   * Default node failure threshold for a component instance: {@value}
+   * Should be lower than the default component failure threshold to allow
+   * the component to start elsewhere
+   */
+  int DEFAULT_NODE_FAILURE_THRESHOLD = 3;
+
+  /**
    * Log aggregation include, exclude patterns
    */
   String YARN_LOG_INCLUDE_PATTERNS = "yarn.log.include.patterns";

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java b/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
index 5b85f7b..17124d2 100644
--- a/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
+++ b/slider-core/src/main/java/org/apache/slider/providers/ProviderRole.java
@@ -18,6 +18,8 @@
 
 package org.apache.slider.providers;
 
+import org.apache.slider.api.ResourceKeys;
+
 /**
  * Provider role and key for use in app requests.
  * 
@@ -28,15 +30,17 @@ public final class ProviderRole {
   public final String name;
   public final int id;
   public final int placementPolicy;
+  public final int nodeFailureThreshold;
 
   public ProviderRole(String name, int id) {
-    this(name, id, PlacementPolicy.DEFAULT);
+    this(name, id, PlacementPolicy.DEFAULT, ResourceKeys.DEFAULT_NODE_FAILURE_THRESHOLD);
   }
 
-  public ProviderRole(String name, int id, int policy) {
+  public ProviderRole(String name, int id, int policy, int nodeFailureThreshold) {
     this.name = name;
     this.id = id;
     this.placementPolicy = policy;
+    this.nodeFailureThreshold = nodeFailureThreshold;
   }
 
   @Override
@@ -59,10 +63,11 @@ public final class ProviderRole {
 
   @Override
   public String toString() {
-    return "ProviderRole{" +
+    return "ProviderRole {" +
            "name='" + name + '\'' +
            ", id=" + id +
            ", policy=" + placementPolicy +
+           ", nodeFailureThreshold=" + nodeFailureThreshold +
            '}';
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMClientProvider.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMClientProvider.java b/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMClientProvider.java
index b790713..1666c84 100644
--- a/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMClientProvider.java
+++ b/slider-core/src/main/java/org/apache/slider/providers/slideram/SliderAMClientProvider.java
@@ -88,7 +88,8 @@ public class SliderAMClientProvider extends AbstractClientProvider
 
   public static final ProviderRole APPMASTER =
       new ProviderRole(COMPONENT_AM, KEY_AM,
-          PlacementPolicy.EXCLUDE_FROM_FLEXING);
+          PlacementPolicy.EXCLUDE_FROM_FLEXING,
+          ResourceKeys.DEFAULT_NODE_FAILURE_THRESHOLD);
 
   /**
    * Initialize role list

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
index 2629a4d..b49366e 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
@@ -945,7 +945,6 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
    *   Creates and starts the web application, and adds a
    *   <code>WebAppService</code> service under the AM, to ensure
    *   a managed web application shutdown.
-   * @param serviceConf AM configuration
    * @param port port to deploy the web application on
    * @param webAppApi web app API instance
    */

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
index 95a7ca5..61b0cd6 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
@@ -276,6 +276,7 @@ public class AppState {
   private long startTimeThreshold;
   
   private int failureThreshold = 10;
+  private int nodeFailureThreshold = 3;
   
   private String logServerURL = "";
   
@@ -559,6 +560,9 @@ public class AppState {
     failureThreshold = globalResOpts.getOptionInt(
         ResourceKeys.CONTAINER_FAILURE_THRESHOLD,
         ResourceKeys.DEFAULT_CONTAINER_FAILURE_THRESHOLD);
+    nodeFailureThreshold = globalResOpts.getOptionInt(
+        ResourceKeys.NODE_FAILURE_THRESHOLD,
+        ResourceKeys.DEFAULT_NODE_FAILURE_THRESHOLD);
     initClusterStatus();
 
 
@@ -625,7 +629,8 @@ public class AppState {
     int placement = SliderUtils.parseAndValidate("value of " + name + " " +
         ResourceKeys.COMPONENT_PLACEMENT_POLICY,
         placementOpt, 0, 0, -1);
-    ProviderRole newRole = new ProviderRole(name, priority, placement);
+    ProviderRole newRole = new ProviderRole(name, priority, placement,
+                                            getNodeFailureThresholdForRole(name));
     log.info("New {} ", newRole);
     return newRole;
   }
@@ -1376,6 +1381,7 @@ public class AppState {
     if (started > 0) {
       long duration = time - started;
       shortlived = duration < (startTimeThreshold * 1000);
+      log.info("Duration {} and startTimeThreshold {}", duration, startTimeThreshold);
     } else {
       // never even saw a start event
       shortlived = true;
@@ -1704,7 +1710,21 @@ public class AppState {
         ResourceKeys.CONTAINER_FAILURE_THRESHOLD,
         failureThreshold);
   }
-  
+
+  /**
+   * Get the node failure threshold for a specific role, falling back to
+   * the global one if the role does not define its own
+   * @param roleName role name
+   * @return the threshold for failures
+   */
+  private int getNodeFailureThresholdForRole(String roleName) {
+    ConfTreeOperations resources =
+        instanceDefinition.getResourceOperations();
+    return resources.getComponentOptInt(roleName,
+                                        ResourceKeys.NODE_FAILURE_THRESHOLD,
+                                        nodeFailureThreshold);
+  }
+
   /**
    * Reset the failure counts of all roles
    */
@@ -1712,7 +1732,7 @@ public class AppState {
     for (RoleStatus roleStatus : getRoleStatusMap().values()) {
       int failed = roleStatus.resetFailed();
       log.info("Resetting failure count of {}; was {}",
-          roleStatus.getName(),
+               roleStatus.getName(),
           failed);
     }
     roleHistory.resetFailedRecently();

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java
index d6022e0..6acac89 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java
@@ -100,12 +100,26 @@ public final class OutstandingRequest {
    * @param labelExpression label to satisfy
    * @return the request to raise
    */
-  public AMRMClient.ContainerRequest buildContainerRequest(Resource resource,
-      RoleStatus role, long time, String labelExpression) {
+  public AMRMClient.ContainerRequest buildContainerRequest(
+      Resource resource, RoleStatus role, long time, String labelExpression) {
     String[] hosts;
     boolean relaxLocality;
     requestedTime = time;
-    if (node != null) {
+    boolean usePlacementHistory = role.isStrictPlacement();
+    if (!usePlacementHistory) {
+      // If strict placement does not mandate using the placement history, check
+      // that the number of recent failures on this node is not higher than the threshold
+      if (node != null) {
+        int numFailuresOnLastHost = node.get(role.getKey()).getFailedRecently();
+        usePlacementHistory = numFailuresOnLastHost <= role.getNodeFailureThreshold();
+        if(!usePlacementHistory) {
+          log.info("Recent node failures {} is higher than threshold {}. Dropping host {} from preference.",
+                   numFailuresOnLastHost, role.getNodeFailureThreshold(), node.hostname);
+        }
+      }
+    }
+
+    if (node != null && usePlacementHistory) {
       hosts = new String[1];
       hosts[0] = node.hostname;
       relaxLocality = !role.isStrictPlacement();

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
index 605a4f8..e94457a 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
@@ -443,7 +443,7 @@ public class RoleHistory {
   /**
    * Get the nodes for an ID -may be null
    * @param id role ID
-   * @return potenially null list
+   * @return potentially null list
    */
   @VisibleForTesting
   public List<NodeInstance> getNodesForRoleId(int id) {
@@ -755,7 +755,7 @@ public class RoleHistory {
                                                        boolean wasReleased,
                                                        boolean shortLived) {
     NodeEntry nodeEntry = getOrCreateNodeEntry(container);
-    log.debug("Finished container for node {}, released={}, shortlived={}",
+    log.info("Finished container for node {}, released={}, shortlived={}",
         nodeEntry.rolePriority, wasReleased, shortLived);
     boolean available;
     if (shortLived) {

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
index 3edc5f1..22c5164 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleStatus.java
@@ -76,6 +76,15 @@ public final class RoleStatus implements Cloneable {
     return providerRole.placementPolicy;
   }
 
+  /**
+   * The number of failures on a specific node that can be tolerated
+   * before selecting a different node for placement
+   * @return the node failure threshold taken from this role's provider role
+   */
+  public int getNodeFailureThreshold() {
+    return providerRole.nodeFailureThreshold;
+  }
+
   public boolean getExcludeFromFlexing() {
     return 0 != (getPlacementPolicy() & PlacementPolicy.EXCLUDE_FROM_FLEXING);
   }

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateDynamicRoles.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateDynamicRoles.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateDynamicRoles.groovy
index 83fb273..13ecf13 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateDynamicRoles.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/appstate/TestMockAppStateDynamicRoles.groovy
@@ -84,6 +84,8 @@ class TestMockAppStateDynamicRoles extends BaseMockAppStateTest
         (ResourceKeys.COMPONENT_INSTANCES): "1",
         (ResourceKeys.COMPONENT_PLACEMENT_POLICY):
             Integer.toString(PlacementPolicy.STRICT),
+        (ResourceKeys.NODE_FAILURE_THRESHOLD):
+            Integer.toString(2),
     ]
 
     instance.resourceOperations.components[ROLE5]= opts5
@@ -148,6 +150,13 @@ class TestMockAppStateDynamicRoles extends BaseMockAppStateTest
   }
 
   @Test
+  public void testNodeFailureThresholdPropagation() throws Throwable {
+    assert (appState.lookupRoleStatus(ROLE4).nodeFailureThreshold == 3)
+    assert (appState.lookupRoleStatus(ROLE5).nodeFailureThreshold == 2)
+
+  }
+
+  @Test
   public void testLaxPlacementSecondRequestRole4() throws Throwable {
     log.info("Initial engine state = $engine")
     def role4 = appState.lookupRoleStatus(ROLE4)

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/39e04e36/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy
index 8f577e5..d87222d 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryRequestTracking.groovy
@@ -82,7 +82,34 @@ class TestRoleHistoryRequestTracking extends BaseMockAppStateTest {
     List<NodeInstance> a2 = roleHistory.cloneAvailableList(0)
     assertListEquals([age2Active0], a2)
   }
-  
+
+  @Test
+  public void testRequestedNodeOffListWithFailures() throws Throwable {
+    NodeInstance ni = roleHistory.findNodeForNewInstance(roleStatus)
+    assert age3Active0 == ni
+    AMRMClient.ContainerRequest req = roleHistory.requestInstanceOnNode(ni,
+        roleStatus,
+        resource,
+        "")
+    assert 1 == req.nodes.size()
+    List<NodeInstance> a2 = roleHistory.cloneAvailableList(0)
+    assertListEquals([age2Active0], a2)
+
+    age3Active0.get(0).failedRecently = 4
+    req = roleHistory.requestInstanceOnNode(ni,
+        roleStatus,
+        resource,
+        "")
+    assertNull(req.nodes)
+
+    age3Active0.get(0).failedRecently = 0
+    req = roleHistory.requestInstanceOnNode(ni,
+        roleStatus,
+        resource,
+        "")
+    assert 1 == req.nodes.size()
+  }
+
   @Test
   public void testFindAndRequestNode() throws Throwable {
     AMRMClient.ContainerRequest req = roleHistory.requestNode(roleStatus, resource)