You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@slider.apache.org by st...@apache.org on 2015/11/12 19:15:17 UTC

[02/12] incubator-slider git commit: SLIDER-967 Use nodemap to build up location restrictions on AA placement

SLIDER-967 Use nodemap to build up location restrictions on AA placement


Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/2606192a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/2606192a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/2606192a

Branch: refs/heads/feature/SLIDER-82-pass-3.1
Commit: 2606192ab0685fc267fcd5f3670e13b93a7a83b6
Parents: 89fd701
Author: Steve Loughran <st...@apache.org>
Authored: Wed Nov 11 13:54:15 2015 +0000
Committer: Steve Loughran <st...@apache.org>
Committed: Wed Nov 11 13:54:15 2015 +0000

----------------------------------------------------------------------
 .../slider/server/appmaster/state/AppState.java | 15 ++-
 .../server/appmaster/state/NodeInstance.java    | 74 +++++++++++++--
 .../slider/server/appmaster/state/NodeMap.java  | 44 ++++++++-
 .../appmaster/state/OutstandingRequest.java     | 67 +++----------
 .../state/OutstandingRequestTracker.java        | 23 ++---
 .../server/appmaster/state/RoleHistory.java     | 70 +++++++-------
 .../appmaster/state/RoleHostnamePair.java       | 75 +++++++++++++++
 .../server/appmaster/web/SliderAMWebApp.java    |  2 +-
 .../model/history/TestRoleHistoryAA.groovy      | 98 +++++++++++++++++++-
 .../TestRoleHistoryContainerEvents.groovy       |  5 +-
 .../appmaster/model/mock/MockNodeReport.groovy  | 34 +++++++
 11 files changed, 376 insertions(+), 131 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/2606192a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
index f74fe98..063a7fc 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
@@ -1427,7 +1427,11 @@ public class AppState {
    * @param updatedNodes updated nodes
    */
   public synchronized void onNodesUpdated(List<NodeReport> updatedNodes) {
-    roleHistory.onNodesUpdated(updatedNodes);
+    boolean changed = roleHistory.onNodesUpdated(updatedNodes);
+    if (changed) {
+      //TODO
+      log.error("TODO: cancel AA requests and re-review");
+    }
   }
 
   /**
@@ -1923,13 +1927,18 @@ public class AppState {
 
       if (isAA) {
         // build one only if there is none outstanding
-        if (role.getPendingAntiAffineRequests() == 0) {
+        if (role.getPendingAntiAffineRequests() == 0
+            && roleHistory.canPlaceAANodes()) {
           log.info("Starting an anti-affine request sequence for {} nodes", delta);
           // log the number outstanding
           role.incPendingAntiAffineRequests(delta - 1);
           addContainerRequest(operations, createContainerRequest(role));
         } else {
-          log.info("Adding {} more anti-affine requests", delta);
+          if (roleHistory.canPlaceAANodes()) {
+            log.info("Adding {} more anti-affine requests", delta);
+          } else {
+            log.warn("Awaiting node map before generating node requests");
+          }
           role.incPendingAntiAffineRequests(delta);
         }
       } else {

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/2606192a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeInstance.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeInstance.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeInstance.java
index 7fc912d..b805ffb 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeInstance.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeInstance.java
@@ -20,16 +20,15 @@ package org.apache.slider.server.appmaster.state;
 
 import org.apache.hadoop.yarn.api.records.NodeReport;
 import org.apache.hadoop.yarn.api.records.NodeState;
-import org.apache.slider.api.types.NodeEntryInformation;
 import org.apache.slider.api.types.NodeInformation;
 import org.apache.slider.common.tools.Comparators;
-import org.apache.slider.common.tools.SliderUtils;
 
 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.List;
 import java.util.ListIterator;
+import java.util.Set;
 
 /**
  * A node instance -stores information about a node in the cluster.
@@ -46,6 +45,9 @@ public class NodeInstance {
    */
   private NodeState nodeState = NodeState.RUNNING;
 
+  /**
+   * Last node report. If null: none
+   */
   private NodeReport nodeReport = null;
 
   /**
@@ -53,6 +55,17 @@ public class NodeInstance {
    */
   private long nodeStateUpdateTime = 0;
 
+  /**
+   * Node labels.
+   *
+   * IMPORTANT: we assume that there is one label/node, which is the policy
+   * for Hadoop as of November 2015
+   */
+  private String nodeLabels = "";
+
+  /**
+   * The list of node entries of specific roles
+   */
   private final List<NodeEntry> nodeEntries;
 
   /**
@@ -64,18 +77,41 @@ public class NodeInstance {
     nodeEntries = new ArrayList<>(roles);
   }
 
-
   /**
-   * Update the node status
+   * Update the node status.
+   * The return code is true if the node state changed enough to
+   * trigger a re-evaluation of pending requests. That is, either a node
+   * became available when it was previously not, or the label changed
+   * on an available node.
+   *
+   * Transitions of a node from live to dead aren't treated as significant,
+   * nor label changes on a dead node.
+   *
    * @param report latest node report
-   * @return true if the node state changed
+   * @return true if the node state changed enough for a request evaluation.
    */
   public synchronized boolean updateNode(NodeReport report) {
+    nodeStateUpdateTime = report.getLastHealthReportTime();
     nodeReport = report;
     NodeState oldState = nodeState;
+    boolean oldStateUnusable = oldState.isUnusable();
     nodeState = report.getNodeState();
-    nodeStateUpdateTime = report.getLastHealthReportTime();
-    return nodeState != oldState;
+    boolean newUsable = !nodeState.isUnusable();
+    boolean nodeNowAvailable = oldStateUnusable && newUsable;
+    String labels = this.nodeLabels;
+    Set<String> newlabels = report.getNodeLabels();
+    if (newlabels != null && !newlabels.isEmpty()) {
+      nodeLabels = newlabels.iterator().next().trim();
+    } else {
+      nodeLabels = "";
+    }
+    return nodeNowAvailable
+        || newUsable && !this.nodeLabels.equals(labels);
+  }
+
+
+  public String getNodeLabels() {
+    return nodeLabels;
   }
 
   /**
@@ -130,6 +166,14 @@ public class NodeInstance {
   }
 
   /**
+   * Is the node considered online
+   * @return the node
+   */
+  public boolean isOnline() {
+    return !nodeState.isUnusable();
+  }
+
+  /**
    * Query for a node being considered unreliable
    * @param role role key
    * @param threshold threshold above which a node is considered unreliable
@@ -137,7 +181,6 @@ public class NodeInstance {
    */
   public boolean isConsideredUnreliable(int role, int threshold) {
     NodeEntry entry = get(role);
-
     return entry != null && entry.getFailedRecently() > threshold;
   }
 
@@ -258,10 +301,10 @@ public class NodeInstance {
     // null-handling state constructor
     info.state = "" + nodeState;
     info.lastUpdated = nodeStateUpdateTime;
+    info.labels = nodeLabels;
     if (nodeReport != null) {
       info.httpAddress = nodeReport.getHttpAddress();
       info.rackName = nodeReport.getRackName();
-      info.labels = SliderUtils.join(nodeReport.getNodeLabels(), ", ", false);
       info.healthReport = nodeReport.getHealthReport();
     }
     info.entries = new ArrayList<>(nodeEntries.size());
@@ -272,6 +315,19 @@ public class NodeInstance {
   }
 
   /**
+   * Is this node instance a suitable candidate for the specific role?
+   * @param role role ID
+   * @param label label which must match, or "" for no label checks
+   * @return true if the node has space for this role, is running and the labels
+   * match.
+   */
+  public boolean canHost(int role, String label) {
+    return isOnline()
+        && (label.isEmpty() || label.equals(nodeLabels))   // label match
+        && (get(role) == null || get(role).isAvailable()); // no live role
+  }
+
+  /**
    * A comparator for sorting entries where the node is preferred over another.
    *
    * The exact algorithm may change: current policy is "most recent first", so sorted

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/2606192a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeMap.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeMap.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeMap.java
index b631057..2887c9e 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeMap.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/NodeMap.java
@@ -92,20 +92,24 @@ public class NodeMap extends HashMap<String, NodeInstance> {
     }
   }
 
-
   /**
-   * Update the node state
+   * Update the node state. Return true if the node state changed: either by
+   * being created, or by changing its internal state as defined
+   * by {@link NodeInstance#updateNode(NodeReport)}.
+   *
    * @param hostname host name
    * @param report latest node report
-   * @return the updated node.
+   * @return true if the node state changed enough for a request evaluation.
    */
   public boolean updateNode(String hostname, NodeReport report) {
-    return getOrCreate(hostname).updateNode(report);
+    boolean nodeExisted = get(hostname) != null;
+    boolean updated = getOrCreate(hostname).updateNode(report);
+    return updated || !nodeExisted;
   }
 
   /**
    * Clone point
-   * @return
+   * @return a shallow clone
    */
   @Override
   public Object clone() {
@@ -123,4 +127,34 @@ public class NodeMap extends HashMap<String, NodeInstance> {
       put(node.hostname, node);
     }
   }
+
+  /**
+   * Test helper: build or update a cluster from a list of node reports
+   * @param reports the list of reports
+   * @return true if this has been considered to have changed the cluster
+   */
+  @VisibleForTesting
+  public boolean buildOrUpdate(List<NodeReport> reports) {
+    boolean updated = false;
+    for (NodeReport report : reports) {
+      updated |= getOrCreate(report.getNodeId().getHost()).updateNode(report);
+    }
+    return updated;
+  }
+
+  /**
+   * Scan the current node map for all nodes capable of hosting an instance
+   * @param role role ID
+   * @param label label which must match, or "" for no label checks
+   * @return a list of node instances matching the criteria.
+   */
+  public List<NodeInstance> findNodesForRole(int role, String label) {
+    List<NodeInstance> nodes = new ArrayList<>(size());
+    for (NodeInstance instance : values()) {
+      if (instance.canHost(role, label)) {
+        nodes.add(instance);
+      }
+    }
+    return nodes;
+  }
 }

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/2606192a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java
index 38bc96f..a9d4b52 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequest.java
@@ -43,24 +43,14 @@ import java.util.List;
  * instance constructed with (role, hostname) can be used to look up
  * a complete request instance in the {@link OutstandingRequestTracker} map
  */
-public final class OutstandingRequest {
+public final class OutstandingRequest extends RoleHostnamePair {
   protected static final Logger log =
     LoggerFactory.getLogger(OutstandingRequest.class);
 
   /**
-   * requested role
-   */
-  public final int roleId;
-
-  /**
    * Node the request is for -may be null
    */
   public final NodeInstance node;
-  
-  /**
-   * hostname -will be null if node==null
-   */
-  public final String hostname;
 
   /**
    * Optional label. This is cached as the request option (explicit-location + label) is forbidden,
@@ -111,9 +101,8 @@ public final class OutstandingRequest {
    */
   public OutstandingRequest(int roleId,
                             NodeInstance node) {
-    this.roleId = roleId;
+    super(roleId, node != null ? node.hostname : null);
     this.node = node;
-    this.hostname = node != null ? node.hostname : null;
   }
 
   /**
@@ -125,9 +114,8 @@ public final class OutstandingRequest {
    * @param hostname hostname
    */
   public OutstandingRequest(int roleId, String hostname) {
+    super(roleId, hostname);
     this.node = null;
-    this.roleId = roleId;
-    this.hostname = hostname;
   }
 
   /**
@@ -301,52 +289,13 @@ public final class OutstandingRequest {
     return issuedRequest != null && issuedRequest.getCapability().equals(resource);
   }
 
-  /**
-   * Equality is on hostname and role
-   * @param o other
-   * @return true on a match
-   */
-  @Override
-  public boolean equals(Object o) {
-    if (this == o) {
-      return true;
-    }
-    if (o == null || getClass() != o.getClass()) {
-      return false;
-    }
-
-    OutstandingRequest request = (OutstandingRequest) o;
-
-    if (roleId != request.roleId) {
-      return false;
-    }
-    if (hostname != null
-        ? !hostname.equals(request.hostname)
-        : request.hostname != null) {
-      return false;
-    }
-    return true;
-  }
-
-  /**
-   * hash on hostname and role
-   * @return hash code
-   */
-  @Override
-  public int hashCode() {
-    int result = roleId;
-    result = 31 * result + (hostname != null ? hostname.hashCode() : 0);
-    return result;
-  }
-
   @Override
   public String toString() {
     int requestRoleId = ContainerPriority.extractRole(getPriority());
     boolean requestHasLocation = ContainerPriority.hasLocation(getPriority());
     final StringBuilder sb = new StringBuilder("OutstandingRequest{");
-    sb.append("roleId=").append(this.roleId);
+    sb.append(super.toString());
     sb.append(", node=").append(node);
-    sb.append(", hostname='").append(hostname).append('\'');
     sb.append(", hasLocation=").append(requestHasLocation);
     sb.append(", requestedTimeMillis=").append(requestedTimeMillis);
     sb.append(", mayEscalate=").append(mayEscalate);
@@ -367,7 +316,6 @@ public final class OutstandingRequest {
     return new CancelSingleRequest(issuedRequest);
   }
 
-
   /**
    * Valid if a node label expression specified on container request is valid or
    * not. Mimics the logic in AMRMClientImpl, so can be used for preflight checking
@@ -410,5 +358,12 @@ public final class OutstandingRequest {
     }
   }
 
+  /**
+   * Create a new role/hostname pair for indexing.
+   * @return a new index.
+   */
+  public RoleHostnamePair getIndex() {
+    return new RoleHostnamePair(roleId, hostname);
+  }
 
 }

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/2606192a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequestTracker.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequestTracker.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequestTracker.java
index a791826..ecdc07a 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequestTracker.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/OutstandingRequestTracker.java
@@ -35,10 +35,12 @@ import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.ListIterator;
 import java.util.Map;
+import java.util.Set;
 
 /**
  * Tracks outstanding requests made with a specific placement option.
@@ -62,8 +64,7 @@ public class OutstandingRequestTracker {
    */
   private final List<AbstractRMOperation> NO_REQUESTS = new ArrayList<>(0);
  
-  private Map<OutstandingRequest, OutstandingRequest> placedRequests =
-    new HashMap<>();
+  private Map<RoleHostnamePair, OutstandingRequest> placedRequests = new HashMap<>();
 
   /**
    * List of open requests; no specific details on them.
@@ -82,10 +83,9 @@ public class OutstandingRequestTracker {
    * @return a new request
    */
   public synchronized OutstandingRequest newRequest(NodeInstance instance, int role) {
-    OutstandingRequest request =
-      new OutstandingRequest(role, instance);
+    OutstandingRequest request = new OutstandingRequest(role, instance);
     if (request.isLocated()) {
-      placedRequests.put(request, request);
+      placedRequests.put(request.getIndex(), request);
     } else {
       openRequests.add(request);
     }
@@ -101,7 +101,7 @@ public class OutstandingRequestTracker {
   @VisibleForTesting
   public synchronized OutstandingRequest lookupPlacedRequest(int role, String hostname) {
     Preconditions.checkArgument(hostname != null, "null hostname");
-    return placedRequests.get(new OutstandingRequest(role, hostname));
+    return placedRequests.get(new RoleHostnamePair(role, hostname));
   }
 
   /**
@@ -294,12 +294,12 @@ public class OutstandingRequestTracker {
    */
   public synchronized List<NodeInstance> resetOutstandingRequests(int role) {
     List<NodeInstance> hosts = new ArrayList<>();
-    Iterator<Map.Entry<OutstandingRequest,OutstandingRequest>> iterator =
+    Iterator<Map.Entry<RoleHostnamePair, OutstandingRequest>> iterator =
       placedRequests.entrySet().iterator();
     while (iterator.hasNext()) {
-      Map.Entry<OutstandingRequest, OutstandingRequest> next =
+      Map.Entry<RoleHostnamePair, OutstandingRequest> next =
         iterator.next();
-      OutstandingRequest request = next.getKey();
+      OutstandingRequest request = next.getValue();
       if (request.roleId == role) {
         iterator.remove();
         request.completed();
@@ -390,9 +390,10 @@ public class OutstandingRequestTracker {
    */
   public synchronized List<OutstandingRequest> extractPlacedRequestsForRole(int roleId, int count) {
     List<OutstandingRequest> results = new ArrayList<>();
-    Iterator<OutstandingRequest> iterator = placedRequests.keySet().iterator();
+    Iterator<Map.Entry<RoleHostnamePair, OutstandingRequest>>
+        iterator = placedRequests.entrySet().iterator();
     while (iterator.hasNext() && count > 0) {
-      OutstandingRequest request = iterator.next();
+      OutstandingRequest request = iterator.next().getValue();
       if (request.roleId == roleId) {
         results.add(request);
         count--;

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/2606192a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
index f8271a6..df1f4e1 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
@@ -109,12 +109,6 @@ public class RoleHistory {
   private Map<Integer, LinkedList<NodeInstance>> recentNodes;
 
   /**
-   * Track the failed nodes. Currently used to make wiser decision of container
-   * ask with/without locality. Has other potential uses as well.
-   */
-  private Set<String> failedNodes = new HashSet<>();
-
-  /**
    * Instantiate
    * @param roles initial role list
    * @param recordFactory yarn record factory
@@ -137,7 +131,6 @@ public class RoleHistory {
   protected synchronized void reset() throws BadConfigException {
 
     nodemap = new NodeMap(roleSize);
-    failedNodes = new HashSet<>();
     resetAvailableNodeLists();
     outstandingRequests = new OutstandingRequestTracker();
   }
@@ -578,7 +571,8 @@ public class RoleHistory {
       NodeInstance candidate = targets.get(i);
       if (candidate.getActiveRoleInstances(roleId) == 0) {
         // no active instances: check failure statistics
-        if (strictPlacement || !candidate.exceedsFailureThreshold(role)) {
+        if (strictPlacement
+            || (candidate.isOnline() && !candidate.exceedsFailureThreshold(role))) {
           targets.remove(i);
           // exit criteria for loop is now met
           nodeInstance = candidate;
@@ -779,6 +773,16 @@ public class RoleHistory {
   }
 
   /**
+   * Does the RoleHistory have enough information about the YARN cluster
+   * to start placing AA requests? That is: has it the node map and
+   * any label information needed?
+   * @return true if the caller can start requesting AA nodes
+   */
+  public boolean canPlaceAANodes() {
+    return nodeUpdateReceived.get();
+  }
+
+  /**
    * Get the last time the nodes were updated from YARN
    * @return the update time or zero if never updated.
    */
@@ -788,33 +792,35 @@ public class RoleHistory {
 
   /**
    * Update failedNodes and nodemap based on the node state
-   * 
+   *
    * @param updatedNodes list of updated nodes
+   * @return true if a review should be triggered.
    */
-  public synchronized void onNodesUpdated(List<NodeReport> updatedNodes) {
+  public synchronized boolean onNodesUpdated(List<NodeReport> updatedNodes) {
     log.debug("Updating {} nodes", updatedNodes.size());
     nodesUpdatedTime.set(now());
     nodeUpdateReceived.set(true);
+    int printed = 0;
+    boolean triggerReview = false;
     for (NodeReport updatedNode : updatedNodes) {
       String hostname = updatedNode.getNodeId() == null
-                        ? null 
-                        : updatedNode.getNodeId().getHost();
+          ? ""
+          : updatedNode.getNodeId().getHost();
       NodeState nodeState = updatedNode.getNodeState();
-      if (hostname == null || nodeState == null) {
+      if (hostname.isEmpty() || nodeState == null) {
+        log.warn("Ignoring incomplete update");
         continue;
       }
-      log.debug("host {} is in state {}", hostname, nodeState);
+      if (log.isDebugEnabled() && printed++ < 10) {
+        // log the first few, but avoid overloading the logs for a full cluster
+        // update
+        log.debug("Node \"{}\" is in state {}", hostname, nodeState);
+      }
       // update the node; this also creates an instance if needed
       boolean updated = nodemap.updateNode(hostname, updatedNode);
-      if (updated) {
-        if (nodeState.isUnusable()) {
-          log.info("Failed node {} state {}", hostname, nodeState);
-          failedNodes.add(hostname);
-        } else {
-          failedNodes.remove(hostname);
-        }
-      }
+      triggerReview |= updated;
     }
+    return triggerReview;
   }
 
   /**
@@ -852,7 +858,7 @@ public class RoleHistory {
 
   /**
    * Mark a container finished; if it was released then that is treated
-   * differently. history is touch()ed
+   * differently. history is {@code touch()}-ed
    *
    *
    * @param container completed container
@@ -917,9 +923,6 @@ public class RoleHistory {
     for (NodeInstance node : nodemap.values()) {
       log.info(node.toFullString());
     }
-
-    log.info("Failed nodes: {}",
-        SliderUtils.joinWithInnerSeparator(" ", failedNodes));
   }
 
   /**
@@ -963,17 +966,6 @@ public class RoleHistory {
   }
 
   /**
-   * Get a clone of the failedNodes
-   * 
-   * @return the list
-   */
-  public List<String> cloneFailedNodes() {
-    List<String> lst = new ArrayList<>();
-    lst.addAll(failedNodes);
-    return lst;
-  }
-
-  /**
    * Escalate operation as triggered by external timer.
    * @return a (usually empty) list of cancel/request operations.
    */
@@ -983,8 +975,8 @@ public class RoleHistory {
 
   /**
    * Build the list of requests to cancel from the outstanding list.
-   * @param role
-   * @param toCancel
+   * @param role role
+   * @param toCancel number to cancel
    * @return a list of cancellable operations.
    */
   public synchronized List<AbstractRMOperation> cancelRequestsForRole(RoleStatus role, int toCancel) {

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/2606192a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHostnamePair.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHostnamePair.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHostnamePair.java
new file mode 100644
index 0000000..920887a
--- /dev/null
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHostnamePair.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.slider.server.appmaster.state;
+
+import java.util.Objects;
+
+public class RoleHostnamePair {
+
+  /**
+   * requested role
+   */
+  public final int roleId;
+
+  /**
+   * hostname -will be null if node==null
+   */
+  public final String hostname;
+
+  public RoleHostnamePair(int roleId, String hostname) {
+    this.roleId = roleId;
+    this.hostname = hostname;
+  }
+
+  public int getRoleId() {
+    return roleId;
+  }
+
+  public String getHostname() {
+    return hostname;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (!(o instanceof RoleHostnamePair)) {
+      return false;
+    }
+    RoleHostnamePair that = (RoleHostnamePair) o;
+    return Objects.equals(roleId, that.roleId) &&
+        Objects.equals(hostname, that.hostname);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(roleId, hostname);
+  }
+
+  @Override
+  public String toString() {
+    final StringBuilder sb = new StringBuilder(
+        "RoleHostnamePair{");
+    sb.append("roleId=").append(roleId);
+    sb.append(", hostname='").append(hostname).append('\'');
+    sb.append('}');
+    return sb.toString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/2606192a/slider-core/src/main/java/org/apache/slider/server/appmaster/web/SliderAMWebApp.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/web/SliderAMWebApp.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/web/SliderAMWebApp.java
index 84f0eba..7ecc00c 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/web/SliderAMWebApp.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/web/SliderAMWebApp.java
@@ -92,7 +92,7 @@ public class SliderAMWebApp extends WebApp {
     String regex = "(?!/ws)";
     serveRegex(regex).with(SliderDefaultWrapperServlet.class); 
 
-    Map<String, String> params = new HashMap<String, String>();
+    Map<String, String> params = new HashMap<>();
     params.put(ResourceConfig.FEATURE_IMPLICIT_VIEWABLES, "true");
     params.put(ServletContainer.FEATURE_FILTER_FORWARD_ON_404, "true");
     params.put(ResourceConfig.FEATURE_XMLROOTELEMENT_PROCESSING, "true");

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/2606192a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryAA.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryAA.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryAA.groovy
index 36b9d66..f99326f 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryAA.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryAA.groovy
@@ -18,16 +18,106 @@
 
 package org.apache.slider.server.appmaster.model.history
 
-import org.apache.slider.server.appmaster.model.mock.BaseMockAppStateTest
+import groovy.util.logging.Slf4j
+import org.apache.hadoop.yarn.api.records.NodeReport
+import org.apache.hadoop.yarn.api.records.NodeState
+import org.apache.slider.server.appmaster.model.mock.MockNodeReport
+import org.apache.slider.server.appmaster.state.NodeEntry
+import org.apache.slider.server.appmaster.state.NodeInstance
+import org.apache.slider.server.appmaster.state.NodeMap
+import org.apache.slider.test.SliderTestBase
 import org.junit.Test
 
 /**
  * Test anti-affine
  */
-class TestRoleHistoryAA extends BaseMockAppStateTest {
+//@CompileStatic
+@Slf4j
+class TestRoleHistoryAA extends SliderTestBase {
+
+  List<String> hostnames = ["one", "two", "three"]
+  NodeMap nodeMap, gpuNodeMap
+
+  @Override
+  void setup() {
+    super.setup()
+    nodeMap = createNodeMap(hostnames, NodeState.RUNNING)
+    gpuNodeMap = createNodeMap(hostnames, NodeState.RUNNING, "GPU")
+
+  }
+
+  @Test
+  public void testFindNodesInFullCluster() throws Throwable {
+    // all three will surface at first
+    assertResultSize(3, nodeMap.findNodesForRole(1, ""))
+  }
+
+  @Test
+  public void testFindNodesInUnhealthyCluster() throws Throwable {
+    // all three will surface at first
+    nodeMap.get("one").updateNode(new MockNodeReport("one",NodeState.UNHEALTHY))
+    assertResultSize(2, nodeMap.findNodesForRole(1, ""))
+  }
+
+  @Test
+  public void testFindNoNodesWrongLabel() throws Throwable {
+    // all three will surface at first
+    assertResultSize(0, nodeMap.findNodesForRole(1, "GPU"))
+  }
+
+  @Test
+  public void testFindNoNodesRightLabel() throws Throwable {
+    // all three will surface at first
+    assertResultSize(3, gpuNodeMap.findNodesForRole(1, "GPU"))
+  }
 
   @Test
-  public void test() throws Throwable {
-    
+  public void testFindNoNodesNoLabel() throws Throwable {
+    // all three will surface at first
+    assertResultSize(3, gpuNodeMap.findNodesForRole(1, ""))
+  }
+
+  @Test
+  public void testFindNoNodesClusterRequested() throws Throwable {
+    // all three will surface at first
+    applyToNodeEntries(nodeMap) {
+      NodeEntry it -> it.request()
+    }
+    assertResultSize(0, nodeMap.findNodesForRole(1, ""))
+  }
+
+  @Test
+  public void testFindNoNodesClusterBusy() throws Throwable {
+    // all three will surface at first
+    applyToNodeEntries(nodeMap) {
+      NodeEntry it -> it.request()
+    }
+    assertResultSize(0, nodeMap.findNodesForRole(1, ""))
+  }
+
+  def assertResultSize(int size, List<NodeInstance> list) {
+    if (list.size() != size) {
+      list.each { log.error(it.toFullString())}
+    }
+    assert size == list.size()
+  }
+
+  def applyToNodeEntries(Collection<NodeInstance> list, Closure cl) {
+    list.each { it -> cl(it.getOrCreate(1)) }
+  }
+
+  def applyToNodeEntries(NodeMap nodeMap, Closure cl) {
+    applyToNodeEntries(nodeMap.values(), cl)
+  }
+
+  def NodeMap createNodeMap(List<NodeReport> nodeReports) {
+    NodeMap nodeMap = new NodeMap(1)
+    nodeMap.buildOrUpdate(nodeReports)
+    nodeMap
+  }
+
+  def NodeMap createNodeMap(List<String> hosts, NodeState state,
+      String label = "") {
+    createNodeMap(MockNodeReport.createInstances(hosts, state, label))
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/2606192a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy
index d9cfddb..c8a82bd 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy
@@ -415,7 +415,7 @@ class TestRoleHistoryContainerEvents extends BaseMockAppStateTest {
     // as even unused nodes are added to the list, we expect the map size to be >1
     assert startSize <= endSize
     assert nodemap[hostname] != null
-    assert roleHistory.cloneFailedNodes().contains(hostname)
+    assert !nodemap[hostname].online
 
     // add a failure of a node we've never head of
     def newhost = "newhost"
@@ -428,9 +428,8 @@ class TestRoleHistoryContainerEvents extends BaseMockAppStateTest {
     roleHistory.onNodesUpdated(nodesUpdated)
 
     def nodemap2 = roleHistory.cloneNodemap()
-    assert nodemap2.size() > endSize
-    assert roleHistory.cloneFailedNodes().contains(newhost)
     assert nodemap2[newhost]
+    assert !nodemap2[newhost].online
 
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/2606192a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/MockNodeReport.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/MockNodeReport.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/MockNodeReport.groovy
index 1c7a816..43eef3e 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/MockNodeReport.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/mock/MockNodeReport.groovy
@@ -18,6 +18,7 @@
 
 package org.apache.slider.server.appmaster.model.mock
 
+import groovy.transform.CompileStatic
 import org.apache.hadoop.yarn.api.records.NodeId
 import org.apache.hadoop.yarn.api.records.NodeReport
 import org.apache.hadoop.yarn.api.records.NodeState
@@ -26,6 +27,7 @@ import org.apache.hadoop.yarn.api.records.Resource
 /**
  * Node report for testing
  */
+@CompileStatic
 class MockNodeReport extends NodeReport {
   NodeId nodeId;
   NodeState nodeState;
@@ -37,4 +39,36 @@ class MockNodeReport extends NodeReport {
   String healthReport;
   long lastHealthReportTime;
   Set<String> nodeLabels;
+
+  MockNodeReport() {
+  }
+
+  /**
+   * Create a single instance
+   * @param hostname
+   * @param nodeState
+   * @param label
+   */
+  MockNodeReport(String hostname, NodeState nodeState, String label ="") {
+    nodeId = NodeId.newInstance(hostname, 80)
+    this.nodeState = nodeState
+    this.httpAddress = "http$hostname:80"
+    this.nodeLabels = new HashSet<>()
+    nodeLabels.add(label)
+  }
+
+  /**
+   * Create a list of instances -one for each hostname
+   * @param hostnames hosts
+   * @param nodeState state of all of them
+   * @param label label for all of them
+   * @return
+   */
+  static List<MockNodeReport> createInstances(
+      List<String> hostnames,
+      NodeState nodeState = NodeState.RUNNING,
+      String label = "") {
+    hostnames.collect { String  name ->
+      new MockNodeReport(name, nodeState, label)}
+  }
 }