You are viewing a plain text version of this content. The canonical link for it is here.
Posted to yarn-commits@hadoop.apache.org by tg...@apache.org on 2013/01/14 16:59:56 UTC

svn commit: r1432965 - in /hadoop/common/branches/branch-0.23/hadoop-yarn-project: ./ hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ hadoop-yarn/hadoop-yarn-server/hadoop-yarn-serv...

Author: tgraves
Date: Mon Jan 14 15:59:55 2013
New Revision: 1432965

URL: http://svn.apache.org/viewvc?rev=1432965&view=rev
Log:
YARN-170. NodeManager stop() gets called twice on shutdown (Sandy Ryza via tgraves)

Added:
    hadoop/common/branches/branch-0.23/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerEvent.java
      - copied unchanged from r1429796, hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerEvent.java
    hadoop/common/branches/branch-0.23/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerEventType.java
      - copied unchanged from r1429796, hadoop/common/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerEventType.java
Modified:
    hadoop/common/branches/branch-0.23/hadoop-yarn-project/CHANGES.txt
    hadoop/common/branches/branch-0.23/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
    hadoop/common/branches/branch-0.23/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java
    hadoop/common/branches/branch-0.23/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java

Modified: hadoop/common/branches/branch-0.23/hadoop-yarn-project/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-yarn-project/CHANGES.txt?rev=1432965&r1=1432964&r2=1432965&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-yarn-project/CHANGES.txt (original)
+++ hadoop/common/branches/branch-0.23/hadoop-yarn-project/CHANGES.txt Mon Jan 14 15:59:55 2013
@@ -54,6 +54,9 @@ Release 0.23.6 - UNRELEASED
 
     YARN-334. Maven RAT plugin is not checking all source files (tgraves)
 
+    YARN-170. NodeManager stop() gets called twice on shutdown (Sandy Ryza
+    via tgraves)
+
 Release 0.23.5 - UNRELEASED
 
   INCOMPATIBLE CHANGES

Modified: hadoop/common/branches/branch-0.23/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java?rev=1432965&r1=1432964&r2=1432965&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java Mon Jan 14 15:59:55 2013
@@ -25,6 +25,7 @@ import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
 import java.util.concurrent.ConcurrentSkipListMap;
+import java.util.concurrent.atomic.AtomicBoolean;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -44,6 +45,7 @@ import org.apache.hadoop.yarn.api.record
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.event.AsyncDispatcher;
 import org.apache.hadoop.yarn.event.Dispatcher;
+import org.apache.hadoop.yarn.event.EventHandler;
 import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
@@ -54,11 +56,10 @@ import org.apache.hadoop.yarn.server.nod
 import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
 import org.apache.hadoop.yarn.service.CompositeService;
 import org.apache.hadoop.yarn.service.Service;
-import org.apache.hadoop.yarn.service.ServiceStateChangeListener;
 import org.apache.hadoop.yarn.util.Records;
 
-public class NodeManager extends CompositeService implements
-    ServiceStateChangeListener {
+public class NodeManager extends CompositeService 
+    implements EventHandler<NodeManagerEvent> {
 
   /**
    * Priority of the NodeManager shutdown hook.
@@ -82,6 +83,8 @@ public class NodeManager extends Composi
   
   private long waitForContainersOnShutdownMillis;
   
+  private AtomicBoolean isStopping = new AtomicBoolean(false);
+  
   public NodeManager() {
     super(NodeManager.class.getName());
   }
@@ -152,7 +155,6 @@ public class NodeManager extends Composi
 
     NodeStatusUpdater nodeStatusUpdater =
         createNodeStatusUpdater(context, dispatcher, nodeHealthChecker);
-    nodeStatusUpdater.register(this);
 
     NodeResourceMonitor nodeResourceMonitor = createNodeResourceMonitor();
     addService(nodeResourceMonitor);
@@ -167,6 +169,7 @@ public class NodeManager extends Composi
     addService(webServer);
 
     dispatcher.register(ContainerManagerEventType.class, containerManager);
+    dispatcher.register(NodeManagerEventType.class, this);
     addService(dispatcher);
     
     DefaultMetricsSystem.initialize("NodeManager");
@@ -198,13 +201,17 @@ public class NodeManager extends Composi
 
   @Override
   public void stop() {
+    if (isStopping.getAndSet(true)) {
+      return;
+    }
+    
     cleanupContainers();
     super.stop();
     DefaultMetricsSystem.shutdown();
   }
   
   @SuppressWarnings("unchecked")
-  private void cleanupContainers() {
+  protected void cleanupContainers() {
     Map<ContainerId, Container> containers = context.getContainers();
     if (containers.isEmpty()) {
       return;
@@ -293,24 +300,10 @@ public class NodeManager extends Composi
     return nodeHealthChecker;
   }
 
-  @Override
-  public void stateChanged(Service service) {
-    if (NodeStatusUpdaterImpl.class.getName().equals(service.getName())
-        && STATE.STOPPED.equals(service.getServiceState())) {
-
-      boolean hasToReboot = ((NodeStatusUpdaterImpl) service).hasToRebootNode();
-
-      // Shutdown the Nodemanager when the NodeStatusUpdater is stopped.      
-      stop();
-
-      // Reboot the whole node-manager if NodeStatusUpdater got a reboot command
-      // from the RM.
-      if (hasToReboot) {
-        LOG.info("Rebooting the node manager.");
-        NodeManager nodeManager = createNewNodeManager();
-        nodeManager.initAndStartNodeManager(this.getConfig(), hasToReboot);
-      }
-    }
+  private void reboot() {
+    LOG.info("Rebooting the node manager.");
+    NodeManager nodeManager = createNewNodeManager();
+    nodeManager.initAndStartNodeManager(this.getConfig(), true);
   }
   
   private void initAndStartNodeManager(Configuration conf, boolean hasToReboot) {
@@ -333,6 +326,21 @@ public class NodeManager extends Composi
     }
   }
 
+  @Override
+  public void handle(NodeManagerEvent event) {
+    switch (event.getType()) {
+    case SHUTDOWN:
+      stop();
+      break;
+    case REBOOT:
+      stop();
+      reboot();
+      break;
+    default:
+      LOG.warn("Invalid shutdown event " + event.getType() + ". Ignoring.");
+    }
+  }
+  
   // For testing
   NodeManager createNewNodeManager() {
     return new NodeManager();

Modified: hadoop/common/branches/branch-0.23/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java?rev=1432965&r1=1432964&r2=1432965&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java Mon Jan 14 15:59:55 2013
@@ -88,8 +88,6 @@ public class NodeStatusUpdaterImpl exten
   private final NodeHealthCheckerService healthChecker;
   private final NodeManagerMetrics metrics;
 
-  private boolean hasToRebootNode;
-  
   public NodeStatusUpdaterImpl(Context context, Dispatcher dispatcher,
       NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) {
     super(NodeStatusUpdaterImpl.class.getName());
@@ -149,18 +147,6 @@ public class NodeStatusUpdaterImpl exten
     super.stop();
   }
   
-  private synchronized void reboot() {
-    this.hasToRebootNode = true;
-    // Stop the status-updater. This will trigger a sub-service state change in
-    // the NodeManager which will then decide to reboot or not based on
-    // isRebooted.
-    this.stop();
-  }
-
-  synchronized boolean hasToRebootNode() {
-    return this.hasToRebootNode;
-  }
-
   private boolean isSecurityEnabled() {
     return UserGroupInformation.isSecurityEnabled();
   }
@@ -348,13 +334,15 @@ public class NodeStatusUpdaterImpl exten
               LOG
                   .info("Recieved SHUTDOWN signal from Resourcemanager as part of heartbeat," +
                   		" hence shutting down.");
-              NodeStatusUpdaterImpl.this.stop();
+              dispatcher.getEventHandler().handle(
+                  new NodeManagerEvent(NodeManagerEventType.SHUTDOWN));
               break;
             }
             if (response.getNodeAction() == NodeAction.REBOOT) {
               LOG.info("Node is out of sync with ResourceManager,"
                   + " hence rebooting.");
-              NodeStatusUpdaterImpl.this.reboot();
+              dispatcher.getEventHandler().handle(
+                  new NodeManagerEvent(NodeManagerEventType.REBOOT));
               break;
             }
 

Modified: hadoop/common/branches/branch-0.23/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.23/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java?rev=1432965&r1=1432964&r2=1432965&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.23/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java (original)
+++ hadoop/common/branches/branch-0.23/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java Mon Jan 14 15:59:55 2013
@@ -29,6 +29,7 @@ import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -97,7 +98,7 @@ public class TestNodeStatusUpdater {
   public void tearDown() {
     this.registeredNodes.clear();
     heartBeatID = 0;
-    if (nm != null) {
+    if (nm != null && nm.getServiceState() == STATE.STARTED) {
       nm.stop();
     }
     DefaultMetricsSystem.shutdown();
@@ -447,6 +448,52 @@ public class TestNodeStatusUpdater {
   }
   
   @Test
+  public void testStopReentrant() throws Exception {
+    final AtomicInteger numCleanups = new AtomicInteger(0);
+    nm = new NodeManager() {
+      @Override
+      protected NodeStatusUpdater createNodeStatusUpdater(Context context,
+          Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
+        MyNodeStatusUpdater myNodeStatusUpdater = new MyNodeStatusUpdater(
+            context, dispatcher, healthChecker, metrics);
+        MyResourceTracker2 myResourceTracker2 = new MyResourceTracker2();
+        myResourceTracker2.heartBeatNodeAction = NodeAction.SHUTDOWN;
+        myNodeStatusUpdater.resourceTracker = myResourceTracker2;
+        return myNodeStatusUpdater;
+      }
+      
+      @Override
+      protected void cleanupContainers() {
+        super.cleanupContainers();
+        numCleanups.incrementAndGet();
+      }
+    };
+
+    YarnConfiguration conf = createNMConfig();
+    nm.init(conf);
+    nm.start();
+    
+    int waitCount = 0;
+    while (heartBeatID < 1 && waitCount++ != 20) {
+      Thread.sleep(500);
+    }
+    Assert.assertFalse(heartBeatID < 1);
+
+    // Meanwhile call stop directly as the shutdown hook would
+    nm.stop();
+    
+    // NM takes a while to reach the STOPPED state.
+    waitCount = 0;
+    while (nm.getServiceState() != STATE.STOPPED && waitCount++ != 20) {
+      LOG.info("Waiting for NM to stop..");
+      Thread.sleep(1000);
+    }
+
+    Assert.assertEquals(STATE.STOPPED, nm.getServiceState());
+    Assert.assertEquals(numCleanups.get(), 1);
+  }
+  
+  @Test
   public void testNodeDecommision() throws Exception {
     nm = getNodeManager(NodeAction.SHUTDOWN);
     YarnConfiguration conf = createNMConfig();