You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by ji...@apache.org on 2011/09/10 04:30:22 UTC

svn commit: r1167438 - in /hadoop/common/branches/branch-0.20-security: ./ src/hdfs/org/apache/hadoop/hdfs/server/datanode/ src/test/org/apache/hadoop/hdfs/ src/test/org/apache/hadoop/hdfs/server/namenode/

Author: jitendra
Date: Sat Sep 10 02:30:22 2011
New Revision: 1167438

URL: http://svn.apache.org/viewvc?rev=1167438&view=rev
Log:
HDFS-1260. Block lost when multiple DNs trying to recover it to different genstamps. Contributed by Todd Lipcon.

Modified:
    hadoop/common/branches/branch-0.20-security/CHANGES.txt
    hadoop/common/branches/branch-0.20-security/src/hdfs/org/apache/hadoop/hdfs/server/datanode/FSDataset.java
    hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/TestFileAppend4.java
    hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java

Modified: hadoop/common/branches/branch-0.20-security/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20-security/CHANGES.txt?rev=1167438&r1=1167437&r2=1167438&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20-security/CHANGES.txt (original)
+++ hadoop/common/branches/branch-0.20-security/CHANGES.txt Sat Sep 10 02:30:22 2011
@@ -125,6 +125,9 @@ Release 0.20.205.0 - unreleased
     HDFS-1252. Fix TestDFSConcurrentFileOperations.
     (Todd Lipcon via suresh).
 
+    HDFS-1260. Block lost when multiple DNs trying to recover it to different
+    genstamps. (Todd Lipcon via jitendra)
+
   IMPROVEMENTS
 
     MAPREDUCE-2187. Reporter sends progress during sort/merge. (Anupam Seth via

Modified: hadoop/common/branches/branch-0.20-security/src/hdfs/org/apache/hadoop/hdfs/server/datanode/FSDataset.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20-security/src/hdfs/org/apache/hadoop/hdfs/server/datanode/FSDataset.java?rev=1167438&r1=1167437&r2=1167438&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20-security/src/hdfs/org/apache/hadoop/hdfs/server/datanode/FSDataset.java (original)
+++ hadoop/common/branches/branch-0.20-security/src/hdfs/org/apache/hadoop/hdfs/server/datanode/FSDataset.java Sat Sep 10 02:30:22 2011
@@ -1208,13 +1208,8 @@ public class FSDataset implements FSCons
     File oldMetaFile = findMetaFile(blockFile);
     long oldgs = parseGenerationStamp(blockFile, oldMetaFile);
     
-    //rename meta file to a tmp file
-    File tmpMetaFile = new File(oldMetaFile.getParent(),
-        oldMetaFile.getName()+"_tmp" + newblock.getGenerationStamp());
-    if (!oldMetaFile.renameTo(tmpMetaFile)){
-      throw new IOException("Cannot rename block meta file to " + tmpMetaFile);
-    }
-
+    // First validate the update
+    
     //update generation stamp
     if (oldgs > newblock.getGenerationStamp()) {
       throw new IOException("Cannot update block (id=" + newblock.getBlockId()
@@ -1227,6 +1222,16 @@ public class FSDataset implements FSCons
       throw new IOException("Cannot update block file (=" + blockFile
           + ") length from " + oldblock.getNumBytes() + " to " + newblock.getNumBytes());
     }
+
+    // Now perform the update
+
+    //rename meta file to a tmp file
+    File tmpMetaFile = new File(oldMetaFile.getParent(),
+        oldMetaFile.getName()+"_tmp" + newblock.getGenerationStamp());
+    if (!oldMetaFile.renameTo(tmpMetaFile)){
+      throw new IOException("Cannot rename block meta file to " + tmpMetaFile);
+    }
+
     if (newblock.getNumBytes() < oldblock.getNumBytes()) {
       truncateBlock(blockFile, tmpMetaFile, oldblock.getNumBytes(), newblock.getNumBytes());
     }

Modified: hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/TestFileAppend4.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/TestFileAppend4.java?rev=1167438&r1=1167437&r2=1167438&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/TestFileAppend4.java (original)
+++ hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/TestFileAppend4.java Sat Sep 10 02:30:22 2011
@@ -1153,6 +1153,22 @@ public class TestFileAppend4 extends Tes
     private final CountDownLatch fireLatch = new CountDownLatch(1);
     private final CountDownLatch waitLatch = new CountDownLatch(1);
 
+    boolean delayBefore = true;
+
+    int numTimes = 1;
+
+    public DelayAnswer() {
+    }
+
+    /**
+     * @param delayBefore
+     *          if true, the delay is before the method is called. if false, the
+     *          delay is after the method returns.
+     */
+    public DelayAnswer(boolean delayBefore) {
+      this.delayBefore = delayBefore;
+    }
+    
     /**
      * Wait until the method is called.
      */
@@ -1169,6 +1185,19 @@ public class TestFileAppend4 extends Tes
     }
 
     public Object answer(InvocationOnMock invocation) throws Throwable {
+      if (delayBefore)
+        doDelay();
+      Object ret = invocation.callRealMethod();
+      if (!delayBefore)
+        doDelay();
+      return ret;
+    }
+    
+    private void doDelay() throws Throwable {
+      synchronized (this) {
+        if (--numTimes < 0)
+          return;
+      }
       LOG.info("DelayAnswer firing fireLatch");
       fireLatch.countDown();
       try {
@@ -1178,7 +1207,6 @@ public class TestFileAppend4 extends Tes
       } catch (InterruptedException ie) {
         throw new IOException("Interrupted waiting on latch", ie);
       }
-      return invocation.callRealMethod();
     }
   }
   
@@ -1315,6 +1343,69 @@ public class TestFileAppend4 extends Tes
     }
   }
 
+  /**
+   * Test case where recovery starts on one node, but it's very slow
+   * (delayed right after nextGenerationStamp). A second recovery attempt
+   * completes while this one is being slow. Then we should reject the
+   * recovery from the first one, since it has a lower gen stamp.
+   */
+  public void testSimultaneousRecoveries() throws Exception {
+        LOG.info("START");
+    cluster = new MiniDFSCluster(conf, 3, true, null);
+    FileSystem fs1 = cluster.getFileSystem();;
+    final FileSystem fs2 = AppendTestUtil.createHdfsWithDifferentUsername(fs1.getConf());
+    try {
+      createFile(fs1, "/testSimultaneousRecoveries", 3, BBW_SIZE);
+      stm.sync();
+      loseLeases(fs1);
+
+      // Make the first nextGenerationStamp call get delayed
+      DelayAnswer delayer = new DelayAnswer(false);
+
+      NameNode nn = cluster.getNameNode();
+      nn.namesystem = spy(nn.namesystem);
+      NameNodeAdapter.callNextGenerationStampForBlock(
+        doAnswer(delayer).when(nn.namesystem),
+        (Block)anyObject(), anyBoolean());
+
+      final AtomicReference<Throwable> err = new AtomicReference<Throwable>();
+      Thread recoverThread = new Thread("Recovery thread") {
+        public void run() {
+          try {
+            recoverFile(fs2);
+          } catch (Throwable t) {
+            err.set(t);
+          }
+        }
+      };
+      recoverThread.start();
+
+      LOG.info("Waiting for first nextGenerationStamp to return");
+      delayer.waitForCall();
+
+      LOG.info("Allowing recovery time to try again");
+      Thread.sleep(10000);
+
+      LOG.info("Proceeding first recovery with old GS");
+      delayer.proceed();
+
+      LOG.info("Joining on recovery thread");
+      recoverThread.join();
+
+      LOG.info("Waiting a few seconds for blocks to get corrupted");
+      Thread.sleep(5000);
+
+      // close() should write recovered bbw to HDFS block
+      assertFileSize(fs2, BBW_SIZE); 
+      checkFile(fs2, BBW_SIZE);
+    } finally {
+      fs2.close();
+      fs1.close();
+      cluster.shutdown();
+    }
+    LOG.info("STOP");
+  }
+
   
   /**
    * Mockito answer helper that will throw an exception a given number

Modified: hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java?rev=1167438&r1=1167437&r2=1167438&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java (original)
+++ hadoop/common/branches/branch-0.20-security/src/test/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java Sat Sep 10 02:30:22 2011
@@ -17,11 +17,18 @@
  */
 package org.apache.hadoop.hdfs.server.namenode;
 import java.io.IOException;
+import org.apache.hadoop.hdfs.protocol.Block;
 
 public abstract class NameNodeAdapter {
   public static boolean checkFileProgress(FSNamesystem fsn, String path, boolean checkall) throws IOException {
     INodeFile f = fsn.dir.getFileINode(path);
     return fsn.checkFileProgress(f, checkall);
   }
+
+  public static long callNextGenerationStampForBlock(
+    FSNamesystem fsn, Block block, boolean fromNN) throws IOException {
+    return fsn.nextGenerationStampForBlock(block, fromNN);
+  }
+
 }