You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by sn...@apache.org on 2019/11/26 13:28:38 UTC
[hadoop] branch branch-3.1 updated: YARN-9011. Race condition
during decommissioning. Contributed by Peter Bacsko
This is an automated email from the ASF dual-hosted git repository.
snemeth pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/hadoop.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 80a97e6 YARN-9011. Race condition during decommissioning. Contributed by Peter Bacsko
80a97e6 is described below
commit 80a97e6ac2b77f85f38fb0450fefa95ddc1c2051
Author: Szilard Nemeth <sn...@apache.org>
AuthorDate: Tue Nov 26 14:27:55 2019 +0100
YARN-9011. Race condition during decommissioning. Contributed by Peter Bacsko
---
.../org/apache/hadoop/util/HostsFileReader.java | 35 ++++++++++++-
.../apache/hadoop/util/TestHostsFileReader.java | 61 +++++++++++++++++++++-
.../server/resourcemanager/NodesListManager.java | 43 ++++++++++++---
.../resourcemanager/ResourceTrackerService.java | 13 +++--
4 files changed, 140 insertions(+), 12 deletions(-)
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/HostsFileReader.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/HostsFileReader.java
index 67b0247..2ee8837 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/HostsFileReader.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/HostsFileReader.java
@@ -51,6 +51,8 @@ public class HostsFileReader {
.class);
private final AtomicReference<HostDetails> current;
+ private final AtomicReference<HostDetails> lazyLoaded =
+ new AtomicReference<>();
public HostsFileReader(String inFile,
String exFile) throws IOException {
@@ -186,7 +188,18 @@ public class HostsFileReader {
public void refresh(String includesFile, String excludesFile)
throws IOException {
- LOG.info("Refreshing hosts (include/exclude) list");
+ refreshInternal(includesFile, excludesFile, false);
+ }
+
+ public void lazyRefresh(String includesFile, String excludesFile)
+ throws IOException {
+ refreshInternal(includesFile, excludesFile, true);
+ }
+
+ private void refreshInternal(String includesFile, String excludesFile,
+ boolean lazy) throws IOException {
+ LOG.info("Refreshing hosts (include/exclude) list (lazy refresh = {})",
+ lazy);
HostDetails oldDetails = current.get();
Set<String> newIncludes = oldDetails.includes;
Map<String, Integer> newExcludes = oldDetails.excludes;
@@ -202,7 +215,21 @@ public class HostsFileReader {
}
HostDetails newDetails = new HostDetails(includesFile, newIncludes,
excludesFile, newExcludes);
- current.set(newDetails);
+
+ if (lazy) {
+ lazyLoaded.set(newDetails);
+ } else {
+ current.set(newDetails);
+ }
+ }
+
+ public void finishRefresh() {
+ if (lazyLoaded.get() == null) {
+ throw new IllegalStateException(
+ "Cannot finish refresh - call lazyRefresh() first");
+ }
+ current.set(lazyLoaded.get());
+ lazyLoaded.set(null);
}
@Private
@@ -278,6 +305,10 @@ public class HostsFileReader {
return current.get();
}
+ public HostDetails getLazyLoadedHostDetails() {
+ return lazyLoaded.get();
+ }
+
public void setIncludesFile(String includesFile) {
LOG.info("Setting the includes file to " + includesFile);
HostDetails oldDetails = current.get();
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestHostsFileReader.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestHostsFileReader.java
index 2462114..856a6a0 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestHostsFileReader.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestHostsFileReader.java
@@ -20,6 +20,7 @@ package org.apache.hadoop.util;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
+import java.io.IOException;
import java.util.Map;
import org.apache.hadoop.test.GenericTestUtils;
@@ -347,4 +348,62 @@ public class TestHostsFileReader {
assertTrue(excludes.get("host5") == 1800);
assertTrue(excludes.get("host6") == 1800);
}
-}
+
+ @Test
+ public void testLazyRefresh() throws IOException {
+ FileWriter efw = new FileWriter(excludesFile);
+ FileWriter ifw = new FileWriter(includesFile);
+
+ efw.write("host1\n");
+ efw.write("host2\n");
+ efw.close();
+ ifw.write("host3\n");
+ ifw.write("host4\n");
+ ifw.close();
+
+ HostsFileReader hfp = new HostsFileReader(includesFile, excludesFile);
+
+ ifw = new FileWriter(includesFile);
+ ifw.close();
+
+ efw = new FileWriter(excludesFile, true);
+ efw.write("host3\n");
+ efw.write("host4\n");
+ efw.close();
+
+ hfp.lazyRefresh(includesFile, excludesFile);
+
+ HostDetails details = hfp.getHostDetails();
+ HostDetails lazyDetails = hfp.getLazyLoadedHostDetails();
+
+ assertEquals("Details: no. of excluded hosts", 2,
+ details.getExcludedHosts().size());
+ assertEquals("Details: no. of included hosts", 2,
+ details.getIncludedHosts().size());
+ assertEquals("LazyDetails: no. of excluded hosts", 4,
+ lazyDetails.getExcludedHosts().size());
+ assertEquals("LayDetails: no. of included hosts", 0,
+ lazyDetails.getIncludedHosts().size());
+
+ hfp.finishRefresh();
+
+ details = hfp.getHostDetails();
+ assertEquals("Details: no. of excluded hosts", 4,
+ details.getExcludedHosts().size());
+ assertEquals("Details: no. of included hosts", 0,
+ details.getIncludedHosts().size());
+ assertNull("Lazy host details should be null",
+ hfp.getLazyLoadedHostDetails());
+ }
+
+ @Test(expected = IllegalStateException.class)
+ public void testFinishRefreshWithoutLazyRefresh() throws IOException {
+ FileWriter efw = new FileWriter(excludesFile);
+ FileWriter ifw = new FileWriter(includesFile);
+ efw.close();
+ ifw.close();
+
+ HostsFileReader hfp = new HostsFileReader(includesFile, excludesFile);
+ hfp.finishRefresh();
+ }
+}
\ No newline at end of file
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java
index 647dfa3..500d38d 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java
@@ -83,10 +83,12 @@ public class NodesListManager extends CompositeService implements
private Resolver resolver;
private Timer removalTimer;
private int nodeRemovalCheckInterval;
+ private Set<RMNode> gracefulDecommissionableNodes;
public NodesListManager(RMContext rmContext) {
super(NodesListManager.class.getName());
this.rmContext = rmContext;
+ this.gracefulDecommissionableNodes = ConcurrentHashMap.newKeySet();
}
@Override
@@ -114,7 +116,7 @@ public class NodesListManager extends CompositeService implements
this.hostsReader =
createHostsFileReader(this.includesFile, this.excludesFile);
setDecomissionedNMs();
- printConfiguredHosts();
+ printConfiguredHosts(false);
} catch (YarnException ex) {
disableHostsFileReader(ex);
} catch (IOException ioe) {
@@ -186,7 +188,7 @@ public class NodesListManager extends CompositeService implements
removalTimer.cancel();
}
- private void printConfiguredHosts() {
+ private void printConfiguredHosts(boolean graceful) {
if (!LOG.isDebugEnabled()) {
return;
}
@@ -197,7 +199,12 @@ public class NodesListManager extends CompositeService implements
conf.get(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH,
YarnConfiguration.DEFAULT_RM_NODES_EXCLUDE_FILE_PATH));
- HostDetails hostDetails = hostsReader.getHostDetails();
+ HostDetails hostDetails;
+ if (graceful) {
+ hostDetails = hostsReader.getLazyLoadedHostDetails();
+ } else {
+ hostDetails = hostsReader.getHostDetails();
+ }
for (String include : hostDetails.getIncludedHosts()) {
LOG.debug("include: " + include);
}
@@ -234,8 +241,15 @@ public class NodesListManager extends CompositeService implements
yarnConf.get(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH,
YarnConfiguration.DEFAULT_RM_NODES_EXCLUDE_FILE_PATH);
LOG.info("refreshNodes excludesFile " + excludesFile);
- hostsReader.refresh(includesFile, excludesFile);
- printConfiguredHosts();
+
+ if (graceful) {
+ // update hosts, but don't make it visible just yet
+ hostsReader.lazyRefresh(includesFile, excludesFile);
+ } else {
+ hostsReader.refresh(includesFile, excludesFile);
+ }
+
+ printConfiguredHosts(graceful);
LOG.info("hostsReader include:{" +
StringUtils.join(",", hostsReader.getHosts()) +
@@ -269,7 +283,14 @@ public class NodesListManager extends CompositeService implements
// Nodes need to be decommissioned (graceful or forceful);
List<RMNode> nodesToDecom = new ArrayList<RMNode>();
- HostDetails hostDetails = hostsReader.getHostDetails();
+ HostDetails hostDetails;
+ gracefulDecommissionableNodes.clear();
+ if (graceful) {
+ hostDetails = hostsReader.getLazyLoadedHostDetails();
+ } else {
+ hostDetails = hostsReader.getHostDetails();
+ }
+
Set<String> includes = hostDetails.getIncludedHosts();
Map<String, Integer> excludes = hostDetails.getExcludedMap();
@@ -297,11 +318,13 @@ public class NodesListManager extends CompositeService implements
s != NodeState.DECOMMISSIONING) {
LOG.info("Gracefully decommission " + nodeStr);
nodesToDecom.add(n);
+ gracefulDecommissionableNodes.add(n);
} else if (s == NodeState.DECOMMISSIONING &&
!Objects.equals(n.getDecommissioningTimeout(),
timeoutToUse)) {
LOG.info("Update " + nodeStr + " timeout to be " + timeoutToUse);
nodesToDecom.add(n);
+ gracefulDecommissionableNodes.add(n);
} else {
LOG.info("No action for " + nodeStr);
}
@@ -314,6 +337,10 @@ public class NodesListManager extends CompositeService implements
}
}
+ if (graceful) {
+ hostsReader.finishRefresh();
+ }
+
for (RMNode n : nodesToRecom) {
RMNodeEvent e = new RMNodeEvent(
n.getNodeID(), RMNodeEventType.RECOMMISSION);
@@ -465,6 +492,10 @@ public class NodesListManager extends CompositeService implements
hostDetails.getExcludedHosts());
}
+ boolean isGracefullyDecommissionableNode(RMNode node) {
+ return gracefulDecommissionableNodes.contains(node);
+ }
+
private boolean isValidNode(
String hostName, Set<String> hostsList, Set<String> excludeList) {
String ip = resolver.resolve(hostName);
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
index 2b957ca..ce39c43 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
@@ -731,10 +731,17 @@ public class ResourceTrackerService extends AbstractService implements
*/
private boolean isNodeInDecommissioning(NodeId nodeId) {
RMNode rmNode = this.rmContext.getRMNodes().get(nodeId);
- if (rmNode != null &&
- rmNode.getState().equals(NodeState.DECOMMISSIONING)) {
- return true;
+
+ if (rmNode != null) {
+ NodeState state = rmNode.getState();
+
+ if (state == NodeState.DECOMMISSIONING ||
+ (state == NodeState.RUNNING &&
+ this.nodesListManager.isGracefullyDecommissionableNode(rmNode))) {
+ return true;
+ }
}
+
return false;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org