You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@slider.apache.org by go...@apache.org on 2014/10/23 06:25:12 UTC
[1/2] git commit: SLIDER-439 RM never fulfilled Slider AM's container
request after NM died on a node where HRegionServer was running
Repository: incubator-slider
Updated Branches:
refs/heads/develop 714a335df -> ec07bdc12
SLIDER-439 RM never fulfilled Slider AM's container request after NM died on a node where HRegionServer was running
Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/34b909a8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/34b909a8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/34b909a8
Branch: refs/heads/develop
Commit: 34b909a8edb551b6e9aa7f5ab2b3f6bd04f1b7c5
Parents: 6da903d
Author: Gour Saha <gs...@hortonworks.com>
Authored: Wed Oct 22 21:19:52 2014 -0700
Committer: Gour Saha <gs...@hortonworks.com>
Committed: Wed Oct 22 21:19:52 2014 -0700
----------------------------------------------------------------------
.../server/appmaster/SliderAppMaster.java | 5 +-
.../slider/server/appmaster/state/AppState.java | 5 ++
.../server/appmaster/state/RoleHistory.java | 45 +++++++++++++++-
.../TestRoleHistoryContainerEvents.groovy | 55 ++++++++++++++++++++
4 files changed, 108 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/34b909a8/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
index 06d3597..b3c4b4c 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/SliderAppMaster.java
@@ -1514,7 +1514,10 @@ public class SliderAppMaster extends AbstractSliderLaunchedService
*/
@Override //AMRMClientAsync
public void onNodesUpdated(List<NodeReport> updatedNodes) {
- LOG_YARN.info("Nodes updated");
+ LOG_YARN.info("onNodesUpdated({})", updatedNodes.size());
+ log.info("Updated nodes {}", updatedNodes);
+ // Check if any nodes are lost or revived and update state accordingly
+ appState.onNodesUpdated(updatedNodes);
}
/**
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/34b909a8/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
index 706b0d2..db119bd 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/AppState.java
@@ -28,6 +28,7 @@ import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.NodeId;
+import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.impl.pb.ContainerPBImpl;
import org.apache.hadoop.yarn.client.api.AMRMClient;
@@ -1242,6 +1243,10 @@ public class AppState {
}
}
+ public synchronized void onNodesUpdated(List<NodeReport> updatedNodes) {
+ roleHistory.onNodesUpdated(updatedNodes);
+ }
+
/**
* Is a role short lived by the threshold set for this application
* @param instance instance
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/34b909a8/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
----------------------------------------------------------------------
diff --git a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
index dca7384..9aca32f 100644
--- a/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
+++ b/slider-core/src/main/java/org/apache/slider/server/appmaster/state/RoleHistory.java
@@ -22,6 +22,7 @@ import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.api.records.Container;
+import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.client.api.AMRMClient;
import org.apache.slider.common.tools.SliderUtils;
@@ -88,6 +89,14 @@ public class RoleHistory {
*/
private Map<Integer, LinkedList<NodeInstance>> availableNodes;
+ /**
+ * Tracks nodes that have failed. Currently used to decide more wisely whether
+ * to ask for a container with or without locality; other uses are possible.
+ */
+ private Map<String, Object> failedNodes = new HashMap<String, Object>();
+ // dummy to be used in maps for faster lookup where we don't care about values
+ private final Object DUMMY_VALUE = new Object();
+
public RoleHistory(List<ProviderRole> providerRoles) throws
BadConfigException {
this.providerRoles = providerRoles;
@@ -660,6 +669,28 @@ public class RoleHistory {
}
/**
+ * Update failedNodes and nodemap based on the node state
+ *
+ * @param updatedNodes list of updated nodes
+ */
+ public synchronized void onNodesUpdated(List<NodeReport> updatedNodes) {
+ for (NodeReport updatedNode : updatedNodes) {
+ String hostname = updatedNode.getNodeId() == null ? null : updatedNode
+ .getNodeId().getHost();
+ if (hostname == null) {
+ continue;
+ }
+ if (updatedNode.getNodeState() != null
+ && updatedNode.getNodeState().isUnusable()) {
+ failedNodes.put(hostname, DUMMY_VALUE);
+ nodemap.remove(hostname);
+ } else {
+ failedNodes.remove(hostname);
+ }
+ }
+ }
+
+ /**
* A container release request was issued
* @param container container submitted
*/
@@ -710,7 +741,11 @@ public class RoleHistory {
available = false;
} else {
available = nodeEntry.containerCompleted(wasReleased);
- maybeQueueNodeForWork(container, nodeEntry, available);
+ boolean isFailedNode = failedNodes.containsKey(RoleHistoryUtils
+ .hostnameOf(container));
+ if (!isFailedNode) {
+ maybeQueueNodeForWork(container, nodeEntry, available);
+ }
}
touch();
return available;
@@ -775,5 +810,13 @@ public class RoleHistory {
return outstandingRequests.listOutstandingRequests();
}
+ /**
+ * Get a clone of the failedNodes
+ *
+ * @return a new list holding the hostnames currently recorded as failed
+ */
+ public List<String> cloneFailedNodes() {
+ return new ArrayList<String>(failedNodes.keySet());
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/34b909a8/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy
----------------------------------------------------------------------
diff --git a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy
index 340e72d..dbb70fa 100644
--- a/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy
+++ b/slider-core/src/test/groovy/org/apache/slider/server/appmaster/model/history/TestRoleHistoryContainerEvents.groovy
@@ -18,9 +18,14 @@
package org.apache.slider.server.appmaster.model.history
+import java.util.List;
+
import groovy.transform.CompileStatic
import groovy.util.logging.Slf4j
import org.apache.hadoop.yarn.api.records.Container
+import org.apache.hadoop.yarn.api.records.NodeId;
+import org.apache.hadoop.yarn.api.records.NodeReport;
+import org.apache.hadoop.yarn.api.records.NodeState;
import org.apache.hadoop.yarn.api.records.Priority
import org.apache.hadoop.yarn.api.records.Resource
import org.apache.hadoop.yarn.client.api.AMRMClient
@@ -365,4 +370,54 @@ class TestRoleHistoryContainerEvents extends BaseMockAppStateTest {
MockContainer c2 = (MockContainer) sortedContainers[1]
assert c2.priority.getPriority() == 1
}
+
+ @Test
+ public void testNodeUpdated() throws Throwable {
+ describe("fail a node")
+
+ int role = 0
+ ProviderRole provRole = new ProviderRole(roleName, role)
+ RoleStatus roleStatus = new RoleStatus(provRole)
+ AMRMClient.ContainerRequest request =
+ roleHistory.requestNode(roleStatus, resource);
+
+ String hostname = request.getNodes()[0]
+ assert hostname == age3Active0.hostname
+
+ // build a container
+ MockContainer container = factory.newContainer()
+ container.nodeId = new MockNodeId(hostname, 0)
+ container.priority = request.getPriority()
+ roleHistory.onContainerAssigned(container);
+
+ NodeMap nodemap = roleHistory.cloneNodemap();
+ NodeInstance allocated = nodemap.get(hostname)
+ NodeEntry roleEntry = allocated.get(role)
+ assert roleEntry.starting == 1
+ assert !roleEntry.available
+ RoleInstance ri = new RoleInstance(container);
+ // start it
+ roleHistory.onContainerStartSubmitted(container, ri)
+ roleHistory.onContainerStarted(container)
+
+ int startSize = nodemap.size()
+
+ // now send a list of updated (failed) nodes event
+ List<NodeReport> nodesUpdated = new ArrayList<NodeReport>();
+ NodeId nodeId = NodeId.newInstance(hostname, 0)
+ NodeReport nodeReport = NodeReport.newInstance(nodeId, NodeState.LOST, null, null, null, null, 1, null, 0)
+ nodesUpdated.add(nodeReport)
+ roleHistory.onNodesUpdated(nodesUpdated)
+
+ nodemap = roleHistory.cloneNodemap()
+ int endSize = nodemap.size()
+ if (startSize == 0) {
+ assert endSize == 0
+ } else {
+ assert startSize - endSize == 1
+ }
+ assert nodemap.get(hostname) == null
+ List<String> failedNodes = roleHistory.cloneFailedNodes()
+ assert failedNodes.contains(hostname)
+ }
}