You are viewing a plain text version of this content. The canonical link for it is here.
Posted to hdfs-commits@hadoop.apache.org by co...@apache.org on 2009/11/06 18:58:24 UTC
svn commit: r833499 - in /hadoop/hdfs/trunk: CHANGES.txt
src/test/hdfs/org/apache/hadoop/hdfs/server/datanode/TestBlockReport.java
Author: cos
Date: Fri Nov 6 17:58:24 2009
New Revision: 833499
URL: http://svn.apache.org/viewvc?rev=833499&view=rev
Log:
HDFS-733. TestBlockReport fails intermittently. Contributed by Konstantin Boudnik
Modified:
hadoop/hdfs/trunk/CHANGES.txt
hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/datanode/TestBlockReport.java
Modified: hadoop/hdfs/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hdfs/trunk/CHANGES.txt?rev=833499&r1=833498&r2=833499&view=diff
==============================================================================
--- hadoop/hdfs/trunk/CHANGES.txt (original)
+++ hadoop/hdfs/trunk/CHANGES.txt Fri Nov 6 17:58:24 2009
@@ -39,6 +39,8 @@
HDFS-750. Fix build failure due to TestRename. (suresh)
+ HDFS-733. TestBlockReport fails intermittently. (cos)
+
Release 0.21.0 - Unreleased
INCOMPATIBLE CHANGES
Modified: hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/datanode/TestBlockReport.java
URL: http://svn.apache.org/viewvc/hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/datanode/TestBlockReport.java?rev=833499&r1=833498&r2=833499&view=diff
==============================================================================
--- hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/datanode/TestBlockReport.java (original)
+++ hadoop/hdfs/trunk/src/test/hdfs/org/apache/hadoop/hdfs/server/datanode/TestBlockReport.java Fri Nov 6 17:58:24 2009
@@ -272,9 +272,8 @@
Path filePath = new Path("/" + METHOD_NAME + ".dat");
final int DN_N1 = DN_N0 + 1;
- ArrayList<Block> blocks =
- writeFileAndStartDN(METHOD_NAME,
- FILE_SIZE, filePath, true);
+ ArrayList<Block> blocks = writeFile(METHOD_NAME, FILE_SIZE, filePath);
+ startDNandWait(filePath, true);
cluster.getNameNode().blockReport(
cluster.getDataNodes().get(DN_N1).dnRegistration,
@@ -306,8 +305,8 @@
final int DN_N1 = DN_N0 + 1;
// write file and start second node to be "older" than the original
- ArrayList<Block> blocks = writeFileAndStartDN(METHOD_NAME,
- FILE_SIZE, filePath, true);
+ ArrayList<Block> blocks = writeFile(METHOD_NAME, FILE_SIZE, filePath);
+ startDNandWait(filePath, true);
int randIndex = rand.nextInt(blocks.size());
// Get a block and screw its GS
@@ -378,12 +377,14 @@
shutDownCluster();
startUpCluster();
- // write file and start second node to be "older" than the original
try {
ArrayList<Block> blocks =
- writeFileAndStartDN(METHOD_NAME, 6 * bytesChkSum, filePath, false);
+ writeFile(METHOD_NAME, 6 * bytesChkSum, filePath);
+ Block bl = findBlock(filePath, 6 * bytesChkSum);
+ BlockChecker bc = new BlockChecker(filePath);
+ bc.start();
- prepareSecondReplica(filePath, DN_N1);
+ waitForTempReplica(bl, DN_N1);
cluster.getNameNode().blockReport(
cluster.getDataNodes().get(DN_N1).dnRegistration,
@@ -391,6 +392,10 @@
printStats();
assertEquals("Wrong number of PendingReplication blocks",
blocks.size(), cluster.getNamesystem().getPendingReplicationBlocks());
+
+ try {
+ bc.join();
+ } catch (InterruptedException e) { }
} finally {
resetConfiguration(); // return the initial state of the configuration
}
@@ -414,28 +419,78 @@
try {
ArrayList<Block> blocks =
- writeFileAndStartDN(METHOD_NAME, 6 * bytesChkSum, filePath, false);
- Block b = prepareSecondReplica(filePath, DN_N1);
- corruptBlockGS(b);
- corruptBlockLen(b);
+ writeFile(METHOD_NAME, 6 * bytesChkSum, filePath);
- DatanodeCommand dnC = cluster.getNameNode().blockReport(
+ Block bl = findBlock(filePath, 6 * bytesChkSum);
+ BlockChecker bc = new BlockChecker(filePath);
+ bc.start();
+ corruptBlockGS(bl);
+ corruptBlockLen(bl);
+
+ waitForTempReplica(bl, DN_N1);
+
+ cluster.getNameNode().blockReport(
cluster.getDataNodes().get(DN_N1).dnRegistration,
new BlockListAsLongs(blocks, null).getBlockListAsLongs());
- LOG.debug("Getting command back: " + dnC);
printStats();
assertEquals("Wrong number of PendingReplication blocks",
2, cluster.getNamesystem().getPendingReplicationBlocks());
+
+ try {
+ bc.join();
+ } catch (InterruptedException e) {}
} finally {
resetConfiguration(); // return the initial state of the configuration
}
}
+ private void waitForTempReplica(Block bl, int DN_N1) {
+ final boolean tooLongWait = false;
+ final int TIMEOUT = 40000;
+
+ LOG.debug("Wait for datanode " + DN_N1 + " to appear");
+ while (cluster.getDataNodes().size() <= DN_N1) {
+ waitTil(20);
+ }
+ LOG.debug("Total number of DNs " + cluster.getDataNodes().size());
+ // Look about specified DN for the replica of the block from 1st DN
+ Replica r;
+ r = ((FSDataset) cluster.getDataNodes().get(DN_N1).getFSDataset()).
+ fetchReplicaInfo(bl.getBlockId());
+ long start = System.currentTimeMillis();
+ int count = 0;
+ while (r == null) {
+ waitTil(50);
+ r = ((FSDataset) cluster.getDataNodes().get(DN_N1).getFSDataset()).
+ fetchReplicaInfo(bl.getBlockId());
+ long waiting_period = System.currentTimeMillis() - start;
+ if (count++ % 10 == 0)
+ LOG.debug("Has been waiting for " + waiting_period + " ms.");
+ if (waiting_period > TIMEOUT)
+ assertTrue("Was waiting too long to get ReplicaInfo from a datanode",
+ tooLongWait);
+ }
+
+ HdfsConstants.ReplicaState state = r.getState();
+ LOG.debug("Replica state before the loop " + state.getValue());
+ start = System.currentTimeMillis();
+ while (state != HdfsConstants.ReplicaState.TEMPORARY) {
+ waitTil(100);
+ state = r.getState();
+ LOG.debug("Keep waiting for " + bl.getBlockName() +
+ " is in state " + state.getValue());
+ if (System.currentTimeMillis() - start > TIMEOUT)
+ assertTrue("Was waiting too long for a replica to become TEMPORARY",
+ tooLongWait);
+ }
+ LOG.debug("Replica state after the loop " + state.getValue());
+ }
+
// Helper methods from here below...
- private ArrayList<Block> writeFileAndStartDN(final String METHOD_NAME,
+ // Write file and start second data node.
+ private ArrayList<Block> writeFile(final String METHOD_NAME,
final long fileSize,
- Path filePath,
- boolean waitReplicas)
+ Path filePath)
throws IOException {
ArrayList<Block> blocks = null;
try {
@@ -444,6 +499,12 @@
} catch (IOException e) {
LOG.debug("Caught exception ", e);
}
+ return blocks;
+ }
+
+ private void startDNandWait(Path filePath, boolean waitReplicas)
+ throws IOException {
+ LOG.debug("Before next DN start: " + cluster.getDataNodes().size());
cluster.startDataNodes(conf, 1, true, null, null);
ArrayList<DataNode> datanodes = cluster.getDataNodes();
assertEquals(datanodes.size(), 2);
@@ -452,7 +513,6 @@
+ cluster.getDataNodes().get(datanodes.size() - 1)
.getDatanodeRegistration() + " has been started");
if (waitReplicas) DFSTestUtil.waitReplication(fs, filePath, REPL_FACTOR);
- return blocks;
}
private ArrayList<Block> prepareForRide(final Path filePath,
@@ -543,6 +603,9 @@
private void corruptBlockLen(final Block block)
throws IOException {
+ if (block == null) {
+ throw new IOException("Block isn't suppose to be null");
+ }
long oldLen = block.getNumBytes();
long newLen = oldLen - rand.nextLong();
assertTrue("Old and new length shouldn't be the same",
@@ -554,6 +617,9 @@
private void corruptBlockGS(final Block block)
throws IOException {
+ if (block == null) {
+ throw new IOException("Block isn't suppose to be null");
+ }
long oldGS = block.getGenerationStamp();
long newGS = oldGS - rand.nextLong();
assertTrue("Old and new GS shouldn't be the same",
@@ -563,49 +629,33 @@
" is changed to " + block.getGenerationStamp() + " from " + oldGS);
}
- // The method simply start second node and wait until a TEMPORARY replica
- // appears on it.
- // Returns the block from the specified <code>nodeNum</code> datanode
- private Block prepareSecondReplica(Path filePath,
- int nodeNum) throws IOException {
-
- final boolean tooLongWait = false;
- final int TIMEOUT = 4000;
+ private Block findBlock(Path path, long size) throws IOException {
+ Block ret;
+ List<LocatedBlock> lbs =
+ cluster.getNameNode().getBlockLocations(path.toString(),
+ FILE_START, size).getLocatedBlocks();
+ LocatedBlock lb = lbs.get(lbs.size() - 1);
+
+ // Get block from the first DN
+ ret = cluster.getDataNodes().get(DN_N0).
+ data.getStoredBlock(lb.getBlock().getBlockId());
+ return ret;
+ }
- List<LocatedBlock> lbs =
- cluster.getNameNode().getBlockLocations(filePath.toString(),
- FILE_START, FILE_SIZE).getLocatedBlocks();
- LocatedBlock lb = lbs.get(lbs.size() - 1);
-
- Block ret = cluster.getDataNodes().get(DN_N0).
- data.getStoredBlock(lb.getBlock().getBlockId());
- Replica r =
- ((FSDataset) cluster.getDataNodes().get(nodeNum).getFSDataset()).
- fetchReplicaInfo(ret.getBlockId());
- long start = System.currentTimeMillis();
- while (r == null) {
- waitTil(50);
- r = ((FSDataset) cluster.getDataNodes().get(nodeNum).getFSDataset()).
- fetchReplicaInfo(ret.getBlockId());
- if (System.currentTimeMillis() - start > TIMEOUT)
- assertTrue("Was waiting too long to get ReplicaInfo from a datanode",
- tooLongWait);
+ private class BlockChecker extends Thread {
+ Path filePath;
+
+ public BlockChecker(final Path filePath) {
+ this.filePath = filePath;
}
-
- HdfsConstants.ReplicaState state = r.getState();
- LOG.debug("Replica state before the loop " + state.getValue());
- start = System.currentTimeMillis();
- while (state != HdfsConstants.ReplicaState.TEMPORARY) {
- waitTil(100);
- state = r.getState();
- LOG.debug("Keep waiting for " + ret.getBlockName() +
- " is in state " + state.getValue());
- if (System.currentTimeMillis() - start > TIMEOUT)
- assertTrue("Was waiting too long for a replica to become TEMPORARY",
- tooLongWait);
+
+ public void run() {
+ try {
+ startDNandWait(filePath, true);
+ } catch (IOException e) {
+ LOG.warn("Shouldn't happen", e);
+ }
}
- LOG.debug("Replica state after the loop " + state.getValue());
- return ret;
}
private static void resetConfiguration() {