You are viewing a plain text version of this content. The canonical link for it is here.
Posted to hdfs-commits@hadoop.apache.org by el...@apache.org on 2011/04/27 07:32:42 UTC

svn commit: r1096995 - in /hadoop/hdfs/trunk: CHANGES.txt src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestNodeCount.java

Author: eli
Date: Wed Apr 27 05:32:41 2011
New Revision: 1096995

URL: http://svn.apache.org/viewvc?rev=1096995&view=rev
Log:
HDFS-1829. TestNodeCount waits forever, errs without giving information. Contributed by Matt Foley

Modified:
    hadoop/hdfs/trunk/CHANGES.txt
    hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestNodeCount.java

Modified: hadoop/hdfs/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hdfs/trunk/CHANGES.txt?rev=1096995&r1=1096994&r2=1096995&view=diff
==============================================================================
--- hadoop/hdfs/trunk/CHANGES.txt (original)
+++ hadoop/hdfs/trunk/CHANGES.txt Wed Apr 27 05:32:41 2011
@@ -223,6 +223,9 @@ Trunk (unreleased changes)
     HDFS-1808. TestBalancer waits forever, errs without giving information.
     (Matt Foley via eli)
 
+    HDFS-1829. TestNodeCount waits forever, errs without giving information.
+    (Matt Foley via eli)
+
 Release 0.22.0 - Unreleased
 
   NEW FEATURES

Modified: hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestNodeCount.java
URL: http://svn.apache.org/viewvc/hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestNodeCount.java?rev=1096995&r1=1096994&r2=1096995&view=diff
==============================================================================
--- hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestNodeCount.java (original)
+++ hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestNodeCount.java Wed Apr 27 05:32:41 2011
@@ -19,6 +19,7 @@ package org.apache.hadoop.hdfs.server.na
 
 import java.util.Collection;
 import java.util.Iterator;
+import java.util.concurrent.TimeoutException;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
@@ -35,12 +36,21 @@ import junit.framework.TestCase;
 /**
  * Test if live nodes count per node is correct 
  * so NN makes right decision for under/over-replicated blocks
+ * 
+ * Two of the "while" loops below use "busy wait"
+ * because they are detecting transient states.
  */
 public class TestNodeCount extends TestCase {
+  final short REPLICATION_FACTOR = (short)2;
+  final long TIMEOUT = 20000L;
+  long timeout = 0;
+  long failtime = 0;
+  Block lastBlock = null;
+  NumberReplicas lastNum = null;
+
   public void testNodeCount() throws Exception {
     // start a mini dfs cluster of 2 nodes
     final Configuration conf = new HdfsConfiguration();
-    final short REPLICATION_FACTOR = (short)2;
     final MiniDFSCluster cluster = 
       new MiniDFSCluster.Builder(conf).numDataNodes(REPLICATION_FACTOR).build();
     try {
@@ -83,13 +93,11 @@ public class TestNodeCount extends TestC
       cluster.restartDataNode(dnprop);
       cluster.waitActive();
       
-      // check if excessive replica is detected
-      NumberReplicas num = null;
-      do {
-       synchronized (namesystem) {
-         num = namesystem.blockManager.countNodes(block);
-       }
-      } while (num.excessReplicas() == 0);
+      // check if excessive replica is detected (transient)
+      initializeTimeout(TIMEOUT);
+      while (countNodes(block, namesystem).excessReplicas() == 0) {
+        checkTimeout("excess replicas not detected");
+      }
       
       // find out a non-excess node
       Iterator<DatanodeDescriptor> iter = namesystem.blockManager.blocksMap.nodeIterator(block);
@@ -119,20 +127,65 @@ public class TestNodeCount extends TestC
       }
 
       // The block should be replicated
-      do {
-        num = namesystem.blockManager.countNodes(block);
-      } while (num.liveReplicas() != REPLICATION_FACTOR);
+      initializeTimeout(TIMEOUT);
+      while (countNodes(block, namesystem).liveReplicas() != REPLICATION_FACTOR) {
+        checkTimeout("live replica count not correct", 1000);
+      }
 
       // restart the first datanode
       cluster.restartDataNode(dnprop);
       cluster.waitActive();
 
-      // check if excessive replica is detected
-      do {
-        num = namesystem.blockManager.countNodes(block);
-      } while (num.excessReplicas() != 2);
+      // check if excessive replica is detected (transient)
+      initializeTimeout(TIMEOUT);
+      while (countNodes(block, namesystem).excessReplicas() != 2) {
+        checkTimeout("excess replica count not equal to 2");
+      }
+
     } finally {
       cluster.shutdown();
     }
   }
+  
+  void initializeTimeout(long timeout) {
+    this.timeout = timeout;
+    this.failtime = System.currentTimeMillis()
+        + ((timeout <= 0) ? Long.MAX_VALUE : timeout);
+  }
+  
+  /* busy wait on transient conditions */
+  void checkTimeout(String testLabel) throws TimeoutException {
+    checkTimeout(testLabel, 0);
+  }
+  
+  /* check for timeout, then wait for cycleTime msec */
+  void checkTimeout(String testLabel, long cycleTime) throws TimeoutException {
+    if (System.currentTimeMillis() > failtime) {
+      throw new TimeoutException("Timeout: "
+          + testLabel + " for block " + lastBlock + " after " + timeout 
+          + " msec.  Last counts: live = " + lastNum.liveReplicas()
+          + ", excess = " + lastNum.excessReplicas()
+          + ", corrupt = " + lastNum.corruptReplicas());   
+    }
+    if (cycleTime > 0) {
+      try {
+        Thread.sleep(cycleTime);
+      } catch (InterruptedException ie) {
+        //ignore
+      }
+    }
+  }
+
+  /* threadsafe read of the replication counts for this block */
+  NumberReplicas countNodes(Block block, FSNamesystem namesystem) {
+    namesystem.readLock();
+    try {
+      lastBlock = block;
+      lastNum = namesystem.blockManager.countNodes(block);
+      return lastNum;
+    }
+    finally {
+      namesystem.readUnlock();
+    }
+  }
 }