You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by dh...@apache.org on 2010/06/16 23:21:46 UTC

svn commit: r955395 - in /hadoop/common/branches/branch-0.20-append: CHANGES.txt src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java src/test/org/apache/hadoop/hdfs/server/namenode/TestNodeCount.java

Author: dhruba
Date: Wed Jun 16 21:21:45 2010
New Revision: 955395

URL: http://svn.apache.org/viewvc?rev=955395&view=rev
Log:
HDFS-1215. Fix unit test TestNodeCount.
(Todd Lipcon via dhruba)


Modified:
    hadoop/common/branches/branch-0.20-append/CHANGES.txt
    hadoop/common/branches/branch-0.20-append/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java
    hadoop/common/branches/branch-0.20-append/src/test/org/apache/hadoop/hdfs/server/namenode/TestNodeCount.java

Modified: hadoop/common/branches/branch-0.20-append/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20-append/CHANGES.txt?rev=955395&r1=955394&r2=955395&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20-append/CHANGES.txt (original)
+++ hadoop/common/branches/branch-0.20-append/CHANGES.txt Wed Jun 16 21:21:45 2010
@@ -41,6 +41,9 @@ Release 0.20-append - Unreleased
     HDFS-927. DFSInputStream retries too many times for new block locations.
     (Todd Lipcon via dhruba)
 
+    HDFS-1215. Fix unit test TestNodeCount.
+    (Todd Lipcon via dhruba)
+
 Release 0.20.3 - Unreleased
 
   NEW FEATURES

Modified: hadoop/common/branches/branch-0.20-append/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20-append/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java?rev=955395&r1=955394&r2=955395&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20-append/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java (original)
+++ hadoop/common/branches/branch-0.20-append/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java Wed Jun 16 21:21:45 2010
@@ -731,6 +731,14 @@ public class MiniDFSCluster {
    * Wait until the cluster is active and running.
    */
   public void waitActive() throws IOException {
+    waitActive(true);
+  }
+
+  /**
+   * Wait until the cluster is active.
+   * @param waitHeartbeats if true, will wait until all DNs have sent a heartbeat
+   */
+  public void waitActive(boolean waitHeartbeats) throws IOException {
     if (nameNode == null) {
       return;
     }
@@ -739,7 +747,8 @@ public class MiniDFSCluster {
     DFSClient client = new DFSClient(addr, conf);
 
     // make sure all datanodes are alive and sent heartbeat
-    while (shouldWait(client.datanodeReport(DatanodeReportType.LIVE))) {
+    while (shouldWait(client.datanodeReport(DatanodeReportType.LIVE),
+                      waitHeartbeats)) {
       try {
         Thread.sleep(100);
       } catch (InterruptedException e) {
@@ -749,10 +758,17 @@ public class MiniDFSCluster {
     client.close();
   }
 
-  private synchronized boolean shouldWait(DatanodeInfo[] dnInfo) {
+  private synchronized boolean shouldWait(DatanodeInfo[] dnInfo,
+                                          boolean waitHeartbeats) {
     if (dnInfo.length != numDataNodes) {
       return true;
     }
+
+    // If we don't need heartbeats, we're done.
+    if (!waitHeartbeats) {
+      return false;
+    }
+
     // make sure all datanodes have sent first heartbeat to namenode,
     // using (capacity == 0) as proxy.
     for (DatanodeInfo dn : dnInfo) {

Modified: hadoop/common/branches/branch-0.20-append/src/test/org/apache/hadoop/hdfs/server/namenode/TestNodeCount.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20-append/src/test/org/apache/hadoop/hdfs/server/namenode/TestNodeCount.java?rev=955395&r1=955394&r2=955395&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20-append/src/test/org/apache/hadoop/hdfs/server/namenode/TestNodeCount.java (original)
+++ hadoop/common/branches/branch-0.20-append/src/test/org/apache/hadoop/hdfs/server/namenode/TestNodeCount.java Wed Jun 16 21:21:45 2010
@@ -3,6 +3,8 @@ package org.apache.hadoop.hdfs.server.na
 import java.util.Collection;
 import java.util.Iterator;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -19,6 +21,8 @@ import junit.framework.TestCase;
  * so NN makes right decision for under/over-replicated blocks
  */
 public class TestNodeCount extends TestCase {
+  static final Log LOG = LogFactory.getLog(TestNodeCount.class);
+
   public void testNodeCount() throws Exception {
     // start a mini dfs cluster of 2 nodes
     final Configuration conf = new Configuration();
@@ -41,8 +45,9 @@ public class TestNodeCount extends TestC
       
       // start two new nodes
       cluster.startDataNodes(conf, 2, true, null, null);
-      cluster.waitActive();
+      cluster.waitActive(false);
       
+      LOG.info("Bringing down first DN");
       // bring down first datanode
       DatanodeDescriptor datanode = datanodes[0];
       DataNodeProperties dnprop = cluster.stopDataNode(datanode.getName());
@@ -51,21 +56,22 @@ public class TestNodeCount extends TestC
         datanode.setLastUpdate(0); // mark it dead
         namesystem.heartbeatCheck();
       }
+
+      LOG.info("Waiting for block to be replicated");
       // the block will be replicated
       DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);
 
+      LOG.info("Restarting first datanode");
       // restart the first datanode
       cluster.restartDataNode(dnprop);
-      cluster.waitActive();
-      
+      cluster.waitActive(false);
+
+      LOG.info("Waiting for excess replicas to be detected");
+
       // check if excessive replica is detected
-      NumberReplicas num = null;
-      do {
-       synchronized (namesystem) {
-         num = namesystem.countNodes(block);
-       }
-      } while (num.excessReplicas() == 0);
-      
+      waitForExcessReplicasToChange(namesystem, block, 0);
+
+      LOG.info("Finding a non-excess node");
       // find out a non-excess node
       Iterator<DatanodeDescriptor> iter = namesystem.blocksMap.nodeIterator(block);
       DatanodeDescriptor nonExcessDN = null;
@@ -78,7 +84,8 @@ public class TestNodeCount extends TestC
         }
       }
       assertTrue(nonExcessDN!=null);
-      
+
+      LOG.info("Stopping non-excess node: " + nonExcessDN);
       // bring down non excessive datanode
       dnprop = cluster.stopDataNode(nonExcessDN.getName());
       // make sure that NN detects that the datanode is down
@@ -86,22 +93,46 @@ public class TestNodeCount extends TestC
         nonExcessDN.setLastUpdate(0); // mark it dead
         namesystem.heartbeatCheck();
       }
-      
+
+      LOG.info("Waiting for live replicas to hit repl factor");
       // The block should be replicated
+      NumberReplicas num;
       do {
         num = namesystem.countNodes(block);
       } while (num.liveReplicas() != REPLICATION_FACTOR);
       
+      LOG.info("Restarting first DN");
       // restart the first datanode
       cluster.restartDataNode(dnprop);
-      cluster.waitActive();
-      
+      cluster.waitActive(false);
+
+      LOG.info("Waiting for excess replicas to be detected");
       // check if excessive replica is detected
-      do {
-       num = namesystem.countNodes(block);
-      } while (num.excessReplicas() == 2);
+      waitForExcessReplicasToChange(namesystem, block, 2);
     } finally {
       cluster.shutdown();
     }
   }
+
+  private void waitForExcessReplicasToChange(
+    FSNamesystem namesystem,
+    Block block,
+    int oldReplicas) throws Exception
+  {
+    NumberReplicas num;
+    long startChecking = System.currentTimeMillis();
+    do {
+      synchronized (namesystem) {
+        num = namesystem.countNodes(block);
+      }
+      Thread.sleep(100);
+      if (System.currentTimeMillis() - startChecking > 30000) {
+        namesystem.metaSave("TestNodeCount.meta");
+        LOG.warn("Dumping meta into log directory");
+        fail("Timed out waiting for excess replicas to change");
+      }
+
+    } while (num.excessReplicas() == oldReplicas);
+  }
+    
 }