You are viewing a plain text version of this content. The canonical link for it is here.
Posted to hdfs-commits@hadoop.apache.org by to...@apache.org on 2011/06/25 01:32:34 UTC

svn commit: r1139463 - in /hadoop/common/branches/HDFS-1073/hdfs: ./ src/java/org/apache/hadoop/hdfs/server/namenode/ src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/ src/test/hdfs/org/apache/hadoop/test/

Author: todd
Date: Fri Jun 24 23:32:33 2011
New Revision: 1139463

URL: http://svn.apache.org/viewvc?rev=1139463&view=rev
Log:
HDFS-2093. Handle case where an entirely empty log is left during NN crash. Contributed by Todd Lipcon.

Modified:
    hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt
    hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java
    hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java
    hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java
    hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestFSImageStorageInspector.java
    hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java

Modified: hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt?rev=1139463&r1=1139462&r2=1139463&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt Fri Jun 24 23:32:33 2011
@@ -60,3 +60,5 @@ HDFS-2077. Address checkpoint upload whe
 HDFS-2078. NameNode should not clear directory when restoring removed storage.
            (todd)
 HDFS-2088. Move edits log archiving logic into FSEditLog/JournalManager (todd)
+HDFS-2093. Handle case where an entirely empty log is left during NN crash
+           (todd)

Modified: hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java?rev=1139463&r1=1139462&r2=1139463&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java Fri Jun 24 23:32:33 2011
@@ -162,6 +162,7 @@ class FSDirectory implements Closeable {
 
       startOpt = StartupOption.REGULAR;
     }
+    boolean success = false;
     try {
       if (fsImage.recoverTransitionRead(startOpt)) {
         fsImage.saveNamespace();
@@ -169,9 +170,11 @@ class FSDirectory implements Closeable {
       fsImage.openEditLog();
       
       fsImage.setCheckpointDirectories(null, null);
-    } catch(IOException e) {
-      fsImage.close();
-      throw e;
+      success = true;
+    } finally {
+      if (!success) {
+        fsImage.close();
+      }
     }
     writeLock();
     try {

Modified: hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java?rev=1139463&r1=1139462&r2=1139463&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java Fri Jun 24 23:32:33 2011
@@ -413,6 +413,15 @@ class FSImageTransactionalStorageInspect
                "crash)");
       if (logs.size() == 1) {
         // Only one log, it's our only choice!
+        FoundEditLog log = logs.get(0);
+        if (log.validateLog().numTransactions == 0) {
+          // If it has no transactions, we should consider it corrupt just
+          // to be conservative.
+          // See comment below for similar case
+          LOG.warn("Marking log at " + log.getFile() + " as corrupt since " +
+              "it has no transactions in it.");
+          log.markCorrupt();          
+        }
         return;
       }
 
@@ -430,6 +439,13 @@ class FSImageTransactionalStorageInspect
                    "it is has only " + txns + " valid txns whereas another " +
                    "log has " + maxValidTxnCount);
           log.markCorrupt();
+        } else if (txns == 0) {
+          // this can happen if the NN crashes right after rolling a log
+          // but before the START_LOG_SEGMENT txn is written. Since the log
+          // is empty, we can just move it aside to its corrupt name.
+          LOG.warn("Marking log at " + log.getFile() + " as corrupt since " +
+              "it has no transactions in it.");
+          log.markCorrupt();
         }
       }
     }
@@ -517,6 +533,8 @@ class FSImageTransactionalStorageInspect
       long lastTxId = startTxId + numTransactions - 1;
       File dst = new File(file.getParentFile(),
           NNStorage.getFinalizedEditsFileName(startTxId, lastTxId));
+      LOG.info("Finalizing edits log " + file + " by renaming to "
+          + dst.getName());
       if (!file.renameTo(dst)) {
         throw new IOException("Couldn't finalize log " +
             file + " to " + dst);

Modified: hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java?rev=1139463&r1=1139462&r2=1139463&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java Fri Jun 24 23:32:33 2011
@@ -21,6 +21,7 @@ import junit.framework.TestCase;
 import java.io.*;
 import java.net.URI;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.Iterator;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutorService;
@@ -46,12 +47,15 @@ import org.apache.hadoop.hdfs.server.com
 import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
 import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
 import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
+import org.apache.hadoop.test.GenericTestUtils;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.log4j.Level;
 import org.aspectj.util.FileUtil;
 
 import org.mockito.Mockito;
 
+import com.google.common.collect.Lists;
+
 import static org.apache.hadoop.test.MetricsAsserts.*;
 import static org.junit.Assert.fail;
 import static org.mockito.Matchers.anyObject;
@@ -606,6 +610,63 @@ public class TestEditLog extends TestCas
     }
   }
   
+  public void testCrashRecoveryEmptyLogOneDir() throws Exception {
+    doTestCrashRecoveryEmptyLog(false);
+  }
+  
+  public void testCrashRecoveryEmptyLogBothDirs() throws Exception {
+    doTestCrashRecoveryEmptyLog(true);
+  }
+  
+  /**
+   * Test that the NN handles the corruption properly
+   * after it crashes just after creating an edit log
+   * (ie before writing START_LOG_SEGMENT). In the case
+   * that all logs have this problem, it should mark them
+   * as corrupt instead of trying to finalize them.
+   * 
+   * @param inBothDirs if true, there will be a truncated log in
+   * both of the edits directories. If false, the truncated log
+   * will only be in one of the directories. In both cases, the
+   * NN should fail to start up, because it's aware that txid 3
+   * was reached, but unable to find a non-corrupt log starting there.
+   */
+  private void doTestCrashRecoveryEmptyLog(boolean inBothDirs) throws Exception {
+    // start a cluster 
+    Configuration conf = new HdfsConfiguration();
+    MiniDFSCluster cluster = null;
+    cluster = new MiniDFSCluster.Builder(conf)
+      .numDataNodes(NUM_DATA_NODES).build();
+    cluster.shutdown();
+    
+    Collection<URI> editsDirs = cluster.getNameEditsDirs(0);
+    for (URI uri : editsDirs) {
+      File dir = new File(uri.getPath());
+      File currentDir = new File(dir, "current");
+      // We should start with only the finalized edits_1-2
+      GenericTestUtils.assertGlobEquals(currentDir, "edits_.*",
+          "edits_1-2");
+      // Make a truncated edits_3_inprogress
+      File log = new File(currentDir,
+          NNStorage.getInProgressEditsFileName(3));
+      new EditLogFileOutputStream(log, 1024).create();
+      if (!inBothDirs) {
+        break;
+      }
+    }
+    
+    try {
+      cluster = new MiniDFSCluster.Builder(conf)
+        .numDataNodes(NUM_DATA_NODES).format(false).build();
+      fail("Did not fail to start with all-corrupt logs");
+    } catch (IllegalStateException ise) {
+      GenericTestUtils.assertExceptionContains(
+          "No non-corrupt logs for txid 3", ise);
+    }
+    cluster.shutdown();
+  }
+
+  
   private static class EditLogByteInputStream extends EditLogInputStream {
     private InputStream input;
     private long len;

Modified: hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestFSImageStorageInspector.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestFSImageStorageInspector.java?rev=1139463&r1=1139462&r2=1139463&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestFSImageStorageInspector.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestFSImageStorageInspector.java Fri Jun 24 23:32:33 2011
@@ -25,6 +25,7 @@ import static org.mockito.Mockito.spy;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.List;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -60,6 +61,8 @@ public class TestFSImageStorageInspector
         "/foo/current/edits_inprogress_457");
 
     inspector.inspectDirectory(mockDir);
+    mockLogValidation(inspector,
+        "/foo/current/edits_inprogress_457", 10);
     
     assertEquals(2, inspector.foundEditLogs.size());
     assertEquals(2, inspector.foundImages.size());
@@ -125,6 +128,8 @@ public class TestFSImageStorageInspector
         "/foo/current/edits_952-1000");
 
     inspector.inspectDirectory(mockDir);
+    mockLogValidation(inspector,
+        "/foo/current/edits_inprogress_901", 51);
 
     LoadPlan plan = inspector.createLoadPlan();
     LOG.info("Plan: " + plan);
@@ -237,30 +242,25 @@ public class TestFSImageStorageInspector
    */
   @Test
   public void testLogGroupRecoveryInProgress() throws IOException {
+    String paths[] = new String[] {
+        "/foo1/current/edits_inprogress_123",
+        "/foo2/current/edits_inprogress_123",
+        "/foo3/current/edits_inprogress_123"
+    };
     FSImageTransactionalStorageInspector inspector =
         new FSImageTransactionalStorageInspector();
-    inspector.inspectDirectory(
-        mockDirectoryWithEditLogs("/foo1/current/edits_inprogress_123"));
-    inspector.inspectDirectory(
-        mockDirectoryWithEditLogs("/foo2/current/edits_inprogress_123"));
-    inspector.inspectDirectory(
-        mockDirectoryWithEditLogs("/foo3/current/edits_inprogress_123"));
+    inspector.inspectDirectory(mockDirectoryWithEditLogs(paths[0]));
+    inspector.inspectDirectory(mockDirectoryWithEditLogs(paths[1]));
+    inspector.inspectDirectory(mockDirectoryWithEditLogs(paths[2]));
+
+    // Inject spies to return the valid counts we would like to see
+    mockLogValidation(inspector, paths[0], 2000);
+    mockLogValidation(inspector, paths[1], 2000);
+    mockLogValidation(inspector, paths[2], 1000);
 
     LogGroup lg = inspector.logGroups.get(123L);
     assertEquals(3, lg.logs.size());
     
-    // Inject spies to return the valid counts we would like to see
-    long validTxnCounts[] = new long[] { 2000, 2000, 1000 };
-    for (int i = 0; i < 3; i++) {
-      FoundEditLog inProgressLog = lg.logs.get(i);
-      assertTrue(inProgressLog.isInProgress());
-      
-      inProgressLog = spy(inProgressLog);
-      doReturn(new FSEditLogLoader.EditLogValidation(-1, validTxnCounts[i]))
-        .when(inProgressLog).validateLog();
-      lg.logs.set(i, inProgressLog);      
-    }
-
     lg.planRecovery();
     
     // Check that the short one was marked corrupt
@@ -280,7 +280,32 @@ public class TestFSImageStorageInspector
     Mockito.verify(lg.logs.get(0)).finalizeLog();
     Mockito.verify(lg.logs.get(1)).finalizeLog();
   }
-  
+
+  /**
+   * Mock out the log at the given path to return a specified number
+   * of transactions upon validation.
+   */
+  private void mockLogValidation(
+      FSImageTransactionalStorageInspector inspector,
+      String path, int numValidTransactions) throws IOException {
+    
+    for (LogGroup lg : inspector.logGroups.values()) {
+      List<FoundEditLog> logs = lg.logs;
+      for (int i = 0; i < logs.size(); i++) {
+        FoundEditLog log = logs.get(i);
+        if (log.file.getPath().equals(path)) {
+          // mock out its validation
+          FoundEditLog spyLog = spy(log);
+          doReturn(new FSEditLogLoader.EditLogValidation(-1, numValidTransactions))
+            .when(spyLog).validateLog();
+          logs.set(i, spyLog);
+          return;
+        }
+      }
+    }
+    fail("No log found to mock out at " + path);
+  }
+
   /**
    * Test when edits and image are in separate directories.
    */
@@ -306,6 +331,9 @@ public class TestFSImageStorageInspector
     inspector.inspectDirectory(mockImageDir);
     inspector.inspectDirectory(mockEditsDir);
     inspector.inspectDirectory(mockImageDir2);
+    
+    mockLogValidation(inspector,
+        "/foo3/current/edits_inprogress_457", 2);
 
     assertEquals(2, inspector.foundEditLogs.size());
     assertEquals(2, inspector.foundImages.size());

Modified: hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java?rev=1139463&r1=1139462&r2=1139463&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java Fri Jun 24 23:32:33 2011
@@ -77,12 +77,11 @@ public abstract class GenericTestUtils {
         Joiner.on(",").join(expectedSet));
   }
   
-  public static void assertExceptionContains(String string, IOException ioe) {
-    String msg = ioe.getMessage();
+  public static void assertExceptionContains(String string, Throwable t) {
+    String msg = t.getMessage();
     Assert.assertTrue(
-        "Unexpected exception:" + StringUtils.stringifyException(ioe),
-        msg.contains(string));
-    
+        "Unexpected exception:" + StringUtils.stringifyException(t),
+        msg.contains(string));    
   }  
 
   /**