You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by ji...@apache.org on 2011/09/10 04:30:22 UTC
svn commit: r1167438 - in /hadoop/common/branches/branch-0.20-security: ./
src/hdfs/org/apache/hadoop/hdfs/server/datanode/
src/test/org/apache/hadoop/hdfs/
src/test/org/apache/hadoop/hdfs/server/namenode/
Author: jitendra
Date: Sat Sep 10 02:30:22 2011
New Revision: 1167438
URL: http://svn.apache.org/viewvc?rev=1167438&view=rev
Log:
HDFS-1260. Block lost when multiple DNs trying to recover it to different genstamps. Contributed by Todd Lipcon.
Modified:
hadoop/common/branches/branch-0.20-security/CHANGES.txt
hadoop/common/branches/branch-0.20-security/src/hdfs/org/apache/hadoop/hdfs/server/datanode/FSDataset.java
hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/TestFileAppend4.java
hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java
Modified: hadoop/common/branches/branch-0.20-security/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20-security/CHANGES.txt?rev=1167438&r1=1167437&r2=1167438&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20-security/CHANGES.txt (original)
+++ hadoop/common/branches/branch-0.20-security/CHANGES.txt Sat Sep 10 02:30:22 2011
@@ -125,6 +125,9 @@ Release 0.20.205.0 - unreleased
HDFS-1252. Fix TestDFSConcurrentFileOperations.
(Todd Lipcon via suresh).
+ HDFS-1260. Block lost when multiple DNs trying to recover it to different
+ genstamps. (Todd Lipcon via jitendra)
+
IMPROVEMENTS
MAPREDUCE-2187. Reporter sends progress during sort/merge. (Anupam Seth via
Modified: hadoop/common/branches/branch-0.20-security/src/hdfs/org/apache/hadoop/hdfs/server/datanode/FSDataset.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20-security/src/hdfs/org/apache/hadoop/hdfs/server/datanode/FSDataset.java?rev=1167438&r1=1167437&r2=1167438&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20-security/src/hdfs/org/apache/hadoop/hdfs/server/datanode/FSDataset.java (original)
+++ hadoop/common/branches/branch-0.20-security/src/hdfs/org/apache/hadoop/hdfs/server/datanode/FSDataset.java Sat Sep 10 02:30:22 2011
@@ -1208,13 +1208,8 @@ public class FSDataset implements FSCons
File oldMetaFile = findMetaFile(blockFile);
long oldgs = parseGenerationStamp(blockFile, oldMetaFile);
- //rename meta file to a tmp file
- File tmpMetaFile = new File(oldMetaFile.getParent(),
- oldMetaFile.getName()+"_tmp" + newblock.getGenerationStamp());
- if (!oldMetaFile.renameTo(tmpMetaFile)){
- throw new IOException("Cannot rename block meta file to " + tmpMetaFile);
- }
-
+ // First validate the update
+
//update generation stamp
if (oldgs > newblock.getGenerationStamp()) {
throw new IOException("Cannot update block (id=" + newblock.getBlockId()
@@ -1227,6 +1222,16 @@ public class FSDataset implements FSCons
throw new IOException("Cannot update block file (=" + blockFile
+ ") length from " + oldblock.getNumBytes() + " to " + newblock.getNumBytes());
}
+
+ // Now perform the update
+
+ //rename meta file to a tmp file
+ File tmpMetaFile = new File(oldMetaFile.getParent(),
+ oldMetaFile.getName()+"_tmp" + newblock.getGenerationStamp());
+ if (!oldMetaFile.renameTo(tmpMetaFile)){
+ throw new IOException("Cannot rename block meta file to " + tmpMetaFile);
+ }
+
if (newblock.getNumBytes() < oldblock.getNumBytes()) {
truncateBlock(blockFile, tmpMetaFile, oldblock.getNumBytes(), newblock.getNumBytes());
}
Modified: hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/TestFileAppend4.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/TestFileAppend4.java?rev=1167438&r1=1167437&r2=1167438&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/TestFileAppend4.java (original)
+++ hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/TestFileAppend4.java Sat Sep 10 02:30:22 2011
@@ -1153,6 +1153,22 @@ public class TestFileAppend4 extends Tes
private final CountDownLatch fireLatch = new CountDownLatch(1);
private final CountDownLatch waitLatch = new CountDownLatch(1);
+ boolean delayBefore = true;
+
+ int numTimes = 1;
+
+ public DelayAnswer() {
+ }
+
+ /**
+ * @param delayBefore
+ * if true, the delay is before the method is called. if false, the
+ * delay is after the method returns.
+ */
+ public DelayAnswer(boolean delayBefore) {
+ this.delayBefore = delayBefore;
+ }
+
/**
* Wait until the method is called.
*/
@@ -1169,6 +1185,19 @@ public class TestFileAppend4 extends Tes
}
public Object answer(InvocationOnMock invocation) throws Throwable {
+ if (delayBefore)
+ doDelay();
+ Object ret = invocation.callRealMethod();
+ if (!delayBefore)
+ doDelay();
+ return ret;
+ }
+
+ private void doDelay() throws Throwable {
+ synchronized (this) {
+ if (--numTimes < 0)
+ return;
+ }
LOG.info("DelayAnswer firing fireLatch");
fireLatch.countDown();
try {
@@ -1178,7 +1207,6 @@ public class TestFileAppend4 extends Tes
} catch (InterruptedException ie) {
throw new IOException("Interrupted waiting on latch", ie);
}
- return invocation.callRealMethod();
}
}
@@ -1315,6 +1343,69 @@ public class TestFileAppend4 extends Tes
}
}
+ /**
+ * Test case where recovery starts on one node, but it's very slow
+ * (delayed right after nextGenerationStamp). A second recovery attempt
+ * completes while this one is being slow. Then we should reject the
+ * recovery from the first one, since it has a lower gen stamp.
+ */
+ public void testSimultaneousRecoveries() throws Exception {
+ LOG.info("START");
+ cluster = new MiniDFSCluster(conf, 3, true, null);
+ FileSystem fs1 = cluster.getFileSystem();;
+ final FileSystem fs2 = AppendTestUtil.createHdfsWithDifferentUsername(fs1.getConf());
+ try {
+ createFile(fs1, "/testSimultaneousRecoveries", 3, BBW_SIZE);
+ stm.sync();
+ loseLeases(fs1);
+
+ // Make the first nextGenerationStamp call get delayed
+ DelayAnswer delayer = new DelayAnswer(false);
+
+ NameNode nn = cluster.getNameNode();
+ nn.namesystem = spy(nn.namesystem);
+ NameNodeAdapter.callNextGenerationStampForBlock(
+ doAnswer(delayer).when(nn.namesystem),
+ (Block)anyObject(), anyBoolean());
+
+ final AtomicReference<Throwable> err = new AtomicReference<Throwable>();
+ Thread recoverThread = new Thread("Recovery thread") {
+ public void run() {
+ try {
+ recoverFile(fs2);
+ } catch (Throwable t) {
+ err.set(t);
+ }
+ }
+ };
+ recoverThread.start();
+
+ LOG.info("Waiting for first nextGenerationStamp to return");
+ delayer.waitForCall();
+
+ LOG.info("Allowing recovery time to try again");
+ Thread.sleep(10000);
+
+ LOG.info("Proceeding first recovery with old GS");
+ delayer.proceed();
+
+ LOG.info("Joining on recovery thread");
+ recoverThread.join();
+
+ LOG.info("Waiting a few seconds for blocks to get corrupted");
+ Thread.sleep(5000);
+
+ // close() should write recovered bbw to HDFS block
+ assertFileSize(fs2, BBW_SIZE);
+ checkFile(fs2, BBW_SIZE);
+ } finally {
+ fs2.close();
+ fs1.close();
+ cluster.shutdown();
+ }
+ LOG.info("STOP");
+ }
+
/**
* Mockito answer helper that will throw an exception a given number
Modified: hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java?rev=1167438&r1=1167437&r2=1167438&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java (original)
+++ hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java Sat Sep 10 02:30:22 2011
@@ -17,11 +17,18 @@
*/
package org.apache.hadoop.hdfs.server.namenode;
import java.io.IOException;
+import org.apache.hadoop.hdfs.protocol.Block;
public abstract class NameNodeAdapter {
public static boolean checkFileProgress(FSNamesystem fsn, String path, boolean checkall) throws IOException {
INodeFile f = fsn.dir.getFileINode(path);
return fsn.checkFileProgress(f, checkall);
}
+
+ public static long callNextGenerationStampForBlock(
+ FSNamesystem fsn, Block block, boolean fromNN) throws IOException {
+ return fsn.nextGenerationStampForBlock(block, fromNN);
+ }
+
}