You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by sn...@apache.org on 2019/11/26 13:28:38 UTC

[hadoop] branch branch-3.1 updated: YARN-9011. Race condition during decommissioning. Contributed by Peter Bacsko

This is an automated email from the ASF dual-hosted git repository.

snemeth pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new 80a97e6  YARN-9011. Race condition during decommissioning. Contributed by Peter Bacsko
80a97e6 is described below

commit 80a97e6ac2b77f85f38fb0450fefa95ddc1c2051
Author: Szilard Nemeth <sn...@apache.org>
AuthorDate: Tue Nov 26 14:27:55 2019 +0100

    YARN-9011. Race condition during decommissioning. Contributed by Peter Bacsko
---
 .../org/apache/hadoop/util/HostsFileReader.java    | 35 ++++++++++++-
 .../apache/hadoop/util/TestHostsFileReader.java    | 61 +++++++++++++++++++++-
 .../server/resourcemanager/NodesListManager.java   | 43 ++++++++++++---
 .../resourcemanager/ResourceTrackerService.java    | 13 +++--
 4 files changed, 140 insertions(+), 12 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/HostsFileReader.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/HostsFileReader.java
index 67b0247..2ee8837 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/HostsFileReader.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/HostsFileReader.java
@@ -51,6 +51,8 @@ public class HostsFileReader {
       .class);
 
   private final AtomicReference<HostDetails> current;
+  private final AtomicReference<HostDetails> lazyLoaded =
+      new AtomicReference<>();
 
   public HostsFileReader(String inFile,
                          String exFile) throws IOException {
@@ -186,7 +188,18 @@ public class HostsFileReader {
 
   public void refresh(String includesFile, String excludesFile)
       throws IOException {
-    LOG.info("Refreshing hosts (include/exclude) list");
+    refreshInternal(includesFile, excludesFile, false);
+  }
+
+  public void lazyRefresh(String includesFile, String excludesFile)
+      throws IOException {
+    refreshInternal(includesFile, excludesFile, true);
+  }
+
+  private void refreshInternal(String includesFile, String excludesFile,
+      boolean lazy) throws IOException {
+    LOG.info("Refreshing hosts (include/exclude) list (lazy refresh = {})",
+        lazy);
     HostDetails oldDetails = current.get();
     Set<String> newIncludes = oldDetails.includes;
     Map<String, Integer> newExcludes = oldDetails.excludes;
@@ -202,7 +215,21 @@ public class HostsFileReader {
     }
     HostDetails newDetails = new HostDetails(includesFile, newIncludes,
         excludesFile, newExcludes);
-    current.set(newDetails);
+
+    if (lazy) {
+      lazyLoaded.set(newDetails);
+    } else {
+      current.set(newDetails);
+    }
+  }
+
+  public void finishRefresh() {
+    if (lazyLoaded.get() == null) {
+      throw new IllegalStateException(
+          "Cannot finish refresh - call lazyRefresh() first");
+    }
+    current.set(lazyLoaded.get());
+    lazyLoaded.set(null);
   }
 
   @Private
@@ -278,6 +305,10 @@ public class HostsFileReader {
     return current.get();
   }
 
+  public HostDetails getLazyLoadedHostDetails() {
+    return lazyLoaded.get();
+  }
+
   public void setIncludesFile(String includesFile) {
     LOG.info("Setting the includes file to " + includesFile);
     HostDetails oldDetails = current.get();
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestHostsFileReader.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestHostsFileReader.java
index 2462114..856a6a0 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestHostsFileReader.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestHostsFileReader.java
@@ -20,6 +20,7 @@ package org.apache.hadoop.util;
 import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.FileWriter;
+import java.io.IOException;
 import java.util.Map;
 
 import org.apache.hadoop.test.GenericTestUtils;
@@ -347,4 +348,62 @@ public class TestHostsFileReader {
     assertTrue(excludes.get("host5") == 1800);
     assertTrue(excludes.get("host6") == 1800);
   }
-}
+
+  @Test
+  public void testLazyRefresh() throws IOException {
+    FileWriter efw = new FileWriter(excludesFile);
+    FileWriter ifw = new FileWriter(includesFile);
+
+    efw.write("host1\n");
+    efw.write("host2\n");
+    efw.close();
+    ifw.write("host3\n");
+    ifw.write("host4\n");
+    ifw.close();
+
+    HostsFileReader hfp = new HostsFileReader(includesFile, excludesFile);
+
+    ifw = new FileWriter(includesFile);
+    ifw.close();
+
+    efw = new FileWriter(excludesFile, true);
+    efw.write("host3\n");
+    efw.write("host4\n");
+    efw.close();
+
+    hfp.lazyRefresh(includesFile, excludesFile);
+
+    HostDetails details = hfp.getHostDetails();
+    HostDetails lazyDetails = hfp.getLazyLoadedHostDetails();
+
+    assertEquals("Details: no. of excluded hosts", 2,
+        details.getExcludedHosts().size());
+    assertEquals("Details: no. of included hosts", 2,
+        details.getIncludedHosts().size());
+    assertEquals("LazyDetails: no. of excluded hosts", 4,
+        lazyDetails.getExcludedHosts().size());
+    assertEquals("LayDetails: no. of included hosts", 0,
+        lazyDetails.getIncludedHosts().size());
+
+    hfp.finishRefresh();
+
+    details = hfp.getHostDetails();
+    assertEquals("Details: no. of excluded hosts", 4,
+        details.getExcludedHosts().size());
+    assertEquals("Details: no. of included hosts", 0,
+        details.getIncludedHosts().size());
+    assertNull("Lazy host details should be null",
+        hfp.getLazyLoadedHostDetails());
+  }
+
+  @Test(expected = IllegalStateException.class)
+  public void testFinishRefreshWithoutLazyRefresh() throws IOException {
+    FileWriter efw = new FileWriter(excludesFile);
+    FileWriter ifw = new FileWriter(includesFile);
+    efw.close();
+    ifw.close();
+
+    HostsFileReader hfp = new HostsFileReader(includesFile, excludesFile);
+    hfp.finishRefresh();
+  }
+}
\ No newline at end of file
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java
index 647dfa3..500d38d 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java
@@ -83,10 +83,12 @@ public class NodesListManager extends CompositeService implements
   private Resolver resolver;
   private Timer removalTimer;
   private int nodeRemovalCheckInterval;
+  private Set<RMNode> gracefulDecommissionableNodes;
 
   public NodesListManager(RMContext rmContext) {
     super(NodesListManager.class.getName());
     this.rmContext = rmContext;
+    this.gracefulDecommissionableNodes = ConcurrentHashMap.newKeySet();
   }
 
   @Override
@@ -114,7 +116,7 @@ public class NodesListManager extends CompositeService implements
       this.hostsReader =
           createHostsFileReader(this.includesFile, this.excludesFile);
       setDecomissionedNMs();
-      printConfiguredHosts();
+      printConfiguredHosts(false);
     } catch (YarnException ex) {
       disableHostsFileReader(ex);
     } catch (IOException ioe) {
@@ -186,7 +188,7 @@ public class NodesListManager extends CompositeService implements
     removalTimer.cancel();
   }
 
-  private void printConfiguredHosts() {
+  private void printConfiguredHosts(boolean graceful) {
     if (!LOG.isDebugEnabled()) {
       return;
     }
@@ -197,7 +199,12 @@ public class NodesListManager extends CompositeService implements
         conf.get(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH,
             YarnConfiguration.DEFAULT_RM_NODES_EXCLUDE_FILE_PATH));
 
-    HostDetails hostDetails = hostsReader.getHostDetails();
+    HostDetails hostDetails;
+    if (graceful) {
+      hostDetails = hostsReader.getLazyLoadedHostDetails();
+    } else {
+      hostDetails = hostsReader.getHostDetails();
+    }
     for (String include : hostDetails.getIncludedHosts()) {
       LOG.debug("include: " + include);
     }
@@ -234,8 +241,15 @@ public class NodesListManager extends CompositeService implements
         yarnConf.get(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH,
             YarnConfiguration.DEFAULT_RM_NODES_EXCLUDE_FILE_PATH);
     LOG.info("refreshNodes excludesFile " + excludesFile);
-    hostsReader.refresh(includesFile, excludesFile);
-    printConfiguredHosts();
+
+    if (graceful) {
+      // update hosts, but don't make it visible just yet
+      hostsReader.lazyRefresh(includesFile, excludesFile);
+    } else {
+      hostsReader.refresh(includesFile, excludesFile);
+    }
+
+    printConfiguredHosts(graceful);
 
     LOG.info("hostsReader include:{" +
         StringUtils.join(",", hostsReader.getHosts()) +
@@ -269,7 +283,14 @@ public class NodesListManager extends CompositeService implements
     // Nodes need to be decommissioned (graceful or forceful);
     List<RMNode> nodesToDecom = new ArrayList<RMNode>();
 
-    HostDetails hostDetails = hostsReader.getHostDetails();
+    HostDetails hostDetails;
+    gracefulDecommissionableNodes.clear();
+    if (graceful) {
+      hostDetails = hostsReader.getLazyLoadedHostDetails();
+    } else {
+      hostDetails = hostsReader.getHostDetails();
+    }
+
     Set<String> includes = hostDetails.getIncludedHosts();
     Map<String, Integer> excludes = hostDetails.getExcludedMap();
 
@@ -297,11 +318,13 @@ public class NodesListManager extends CompositeService implements
               s != NodeState.DECOMMISSIONING) {
             LOG.info("Gracefully decommission " + nodeStr);
             nodesToDecom.add(n);
+            gracefulDecommissionableNodes.add(n);
           } else if (s == NodeState.DECOMMISSIONING &&
                      !Objects.equals(n.getDecommissioningTimeout(),
                          timeoutToUse)) {
             LOG.info("Update " + nodeStr + " timeout to be " + timeoutToUse);
             nodesToDecom.add(n);
+            gracefulDecommissionableNodes.add(n);
           } else {
             LOG.info("No action for " + nodeStr);
           }
@@ -314,6 +337,10 @@ public class NodesListManager extends CompositeService implements
       }
     }
 
+    if (graceful) {
+      hostsReader.finishRefresh();
+    }
+
     for (RMNode n : nodesToRecom) {
       RMNodeEvent e = new RMNodeEvent(
           n.getNodeID(), RMNodeEventType.RECOMMISSION);
@@ -465,6 +492,10 @@ public class NodesListManager extends CompositeService implements
         hostDetails.getExcludedHosts());
   }
 
+  boolean isGracefullyDecommissionableNode(RMNode node) {
+    return gracefulDecommissionableNodes.contains(node);
+  }
+
   private boolean isValidNode(
       String hostName, Set<String> hostsList, Set<String> excludeList) {
     String ip = resolver.resolve(hostName);
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
index 2b957ca..ce39c43 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
@@ -731,10 +731,17 @@ public class ResourceTrackerService extends AbstractService implements
    */
   private boolean isNodeInDecommissioning(NodeId nodeId) {
     RMNode rmNode = this.rmContext.getRMNodes().get(nodeId);
-    if (rmNode != null &&
-        rmNode.getState().equals(NodeState.DECOMMISSIONING)) {
-      return true;
+
+    if (rmNode != null) {
+      NodeState state = rmNode.getState();
+
+      if (state == NodeState.DECOMMISSIONING ||
+          (state == NodeState.RUNNING &&
+          this.nodesListManager.isGracefullyDecommissionableNode(rmNode))) {
+        return true;
+      }
     }
+
     return false;
   }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org