You are viewing a plain text version of this content. The canonical link for it is here.
Posted to hdfs-commits@hadoop.apache.org by to...@apache.org on 2011/06/25 01:32:34 UTC
svn commit: r1139463 - in /hadoop/common/branches/HDFS-1073/hdfs: ./
src/java/org/apache/hadoop/hdfs/server/namenode/
src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/
src/test/hdfs/org/apache/hadoop/test/
Author: todd
Date: Fri Jun 24 23:32:33 2011
New Revision: 1139463
URL: http://svn.apache.org/viewvc?rev=1139463&view=rev
Log:
HDFS-2093. Handle case where an entirely empty log is left during NN crash. Contributed by Todd Lipcon.
Modified:
hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt
hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java
hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java
hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java
hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestFSImageStorageInspector.java
hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java
Modified: hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt?rev=1139463&r1=1139462&r2=1139463&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt Fri Jun 24 23:32:33 2011
@@ -60,3 +60,5 @@ HDFS-2077. Address checkpoint upload whe
HDFS-2078. NameNode should not clear directory when restoring removed storage.
(todd)
HDFS-2088. Move edits log archiving logic into FSEditLog/JournalManager (todd)
+HDFS-2093. Handle case where an entirely empty log is left during NN crash
+ (todd)
Modified: hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java?rev=1139463&r1=1139462&r2=1139463&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java Fri Jun 24 23:32:33 2011
@@ -162,6 +162,7 @@ class FSDirectory implements Closeable {
startOpt = StartupOption.REGULAR;
}
+ boolean success = false;
try {
if (fsImage.recoverTransitionRead(startOpt)) {
fsImage.saveNamespace();
@@ -169,9 +170,11 @@ class FSDirectory implements Closeable {
fsImage.openEditLog();
fsImage.setCheckpointDirectories(null, null);
- } catch(IOException e) {
- fsImage.close();
- throw e;
+ success = true;
+ } finally {
+ if (!success) {
+ fsImage.close();
+ }
}
writeLock();
try {
Modified: hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java?rev=1139463&r1=1139462&r2=1139463&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java Fri Jun 24 23:32:33 2011
@@ -413,6 +413,15 @@ class FSImageTransactionalStorageInspect
"crash)");
if (logs.size() == 1) {
// Only one log, it's our only choice!
+ FoundEditLog log = logs.get(0);
+ if (log.validateLog().numTransactions == 0) {
+ // If it has no transactions, we should consider it corrupt just
+ // to be conservative.
+ // See comment below for similar case
+ LOG.warn("Marking log at " + log.getFile() + " as corrupt since " +
+ "it has no transactions in it.");
+ log.markCorrupt();
+ }
return;
}
@@ -430,6 +439,13 @@ class FSImageTransactionalStorageInspect
"it is has only " + txns + " valid txns whereas another " +
"log has " + maxValidTxnCount);
log.markCorrupt();
+ } else if (txns == 0) {
+ // this can happen if the NN crashes right after rolling a log
+ // but before the START_LOG_SEGMENT txn is written. Since the log
+ // is empty, we can just move it aside to its corrupt name.
+ LOG.warn("Marking log at " + log.getFile() + " as corrupt since " +
+ "it has no transactions in it.");
+ log.markCorrupt();
}
}
}
@@ -517,6 +533,8 @@ class FSImageTransactionalStorageInspect
long lastTxId = startTxId + numTransactions - 1;
File dst = new File(file.getParentFile(),
NNStorage.getFinalizedEditsFileName(startTxId, lastTxId));
+ LOG.info("Finalizing edits log " + file + " by renaming to "
+ + dst.getName());
if (!file.renameTo(dst)) {
throw new IOException("Couldn't finalize log " +
file + " to " + dst);
Modified: hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java?rev=1139463&r1=1139462&r2=1139463&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java Fri Jun 24 23:32:33 2011
@@ -21,6 +21,7 @@ import junit.framework.TestCase;
import java.io.*;
import java.net.URI;
import java.util.Arrays;
+import java.util.Collection;
import java.util.Iterator;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
@@ -46,12 +47,15 @@ import org.apache.hadoop.hdfs.server.com
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
+import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.Level;
import org.aspectj.util.FileUtil;
import org.mockito.Mockito;
+import com.google.common.collect.Lists;
+
import static org.apache.hadoop.test.MetricsAsserts.*;
import static org.junit.Assert.fail;
import static org.mockito.Matchers.anyObject;
@@ -606,6 +610,63 @@ public class TestEditLog extends TestCas
}
}
+ public void testCrashRecoveryEmptyLogOneDir() throws Exception {
+ doTestCrashRecoveryEmptyLog(false);
+ }
+
+ public void testCrashRecoveryEmptyLogBothDirs() throws Exception {
+ doTestCrashRecoveryEmptyLog(true);
+ }
+
+ /**
+ * Test that the NN handles the corruption properly
+ * after it crashes just after creating an edit log
+ * (ie before writing START_LOG_SEGMENT). In the case
+ * that all logs have this problem, it should mark them
+ * as corrupt instead of trying to finalize them.
+ *
+ * @param inBothDirs if true, there will be a truncated log in
+ * both of the edits directories. If false, the truncated log
+ * will only be in one of the directories. In both cases, the
+ * NN should fail to start up, because it's aware that txid 3
+ * was reached, but unable to find a non-corrupt log starting there.
+ */
+ private void doTestCrashRecoveryEmptyLog(boolean inBothDirs) throws Exception {
+ // start a cluster
+ Configuration conf = new HdfsConfiguration();
+ MiniDFSCluster cluster = null;
+ cluster = new MiniDFSCluster.Builder(conf)
+ .numDataNodes(NUM_DATA_NODES).build();
+ cluster.shutdown();
+
+ Collection<URI> editsDirs = cluster.getNameEditsDirs(0);
+ for (URI uri : editsDirs) {
+ File dir = new File(uri.getPath());
+ File currentDir = new File(dir, "current");
+ // We should start with only the finalized edits_1-2
+ GenericTestUtils.assertGlobEquals(currentDir, "edits_.*",
+ "edits_1-2");
+ // Make a truncated edits_3_inprogress
+ File log = new File(currentDir,
+ NNStorage.getInProgressEditsFileName(3));
+ new EditLogFileOutputStream(log, 1024).create();
+ if (!inBothDirs) {
+ break;
+ }
+ }
+
+ try {
+ cluster = new MiniDFSCluster.Builder(conf)
+ .numDataNodes(NUM_DATA_NODES).format(false).build();
+ fail("Did not fail to start with all-corrupt logs");
+ } catch (IllegalStateException ise) {
+ GenericTestUtils.assertExceptionContains(
+ "No non-corrupt logs for txid 3", ise);
+ }
+ cluster.shutdown();
+ }
+
+
private static class EditLogByteInputStream extends EditLogInputStream {
private InputStream input;
private long len;
Modified: hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestFSImageStorageInspector.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestFSImageStorageInspector.java?rev=1139463&r1=1139462&r2=1139463&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestFSImageStorageInspector.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestFSImageStorageInspector.java Fri Jun 24 23:32:33 2011
@@ -25,6 +25,7 @@ import static org.mockito.Mockito.spy;
import java.io.File;
import java.io.IOException;
+import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -60,6 +61,8 @@ public class TestFSImageStorageInspector
"/foo/current/edits_inprogress_457");
inspector.inspectDirectory(mockDir);
+ mockLogValidation(inspector,
+ "/foo/current/edits_inprogress_457", 10);
assertEquals(2, inspector.foundEditLogs.size());
assertEquals(2, inspector.foundImages.size());
@@ -125,6 +128,8 @@ public class TestFSImageStorageInspector
"/foo/current/edits_952-1000");
inspector.inspectDirectory(mockDir);
+ mockLogValidation(inspector,
+ "/foo/current/edits_inprogress_901", 51);
LoadPlan plan = inspector.createLoadPlan();
LOG.info("Plan: " + plan);
@@ -237,30 +242,25 @@ public class TestFSImageStorageInspector
*/
@Test
public void testLogGroupRecoveryInProgress() throws IOException {
+ String paths[] = new String[] {
+ "/foo1/current/edits_inprogress_123",
+ "/foo2/current/edits_inprogress_123",
+ "/foo3/current/edits_inprogress_123"
+ };
FSImageTransactionalStorageInspector inspector =
new FSImageTransactionalStorageInspector();
- inspector.inspectDirectory(
- mockDirectoryWithEditLogs("/foo1/current/edits_inprogress_123"));
- inspector.inspectDirectory(
- mockDirectoryWithEditLogs("/foo2/current/edits_inprogress_123"));
- inspector.inspectDirectory(
- mockDirectoryWithEditLogs("/foo3/current/edits_inprogress_123"));
+ inspector.inspectDirectory(mockDirectoryWithEditLogs(paths[0]));
+ inspector.inspectDirectory(mockDirectoryWithEditLogs(paths[1]));
+ inspector.inspectDirectory(mockDirectoryWithEditLogs(paths[2]));
+
+ // Inject spies to return the valid counts we would like to see
+ mockLogValidation(inspector, paths[0], 2000);
+ mockLogValidation(inspector, paths[1], 2000);
+ mockLogValidation(inspector, paths[2], 1000);
LogGroup lg = inspector.logGroups.get(123L);
assertEquals(3, lg.logs.size());
- // Inject spies to return the valid counts we would like to see
- long validTxnCounts[] = new long[] { 2000, 2000, 1000 };
- for (int i = 0; i < 3; i++) {
- FoundEditLog inProgressLog = lg.logs.get(i);
- assertTrue(inProgressLog.isInProgress());
-
- inProgressLog = spy(inProgressLog);
- doReturn(new FSEditLogLoader.EditLogValidation(-1, validTxnCounts[i]))
- .when(inProgressLog).validateLog();
- lg.logs.set(i, inProgressLog);
- }
-
lg.planRecovery();
// Check that the short one was marked corrupt
@@ -280,7 +280,32 @@ public class TestFSImageStorageInspector
Mockito.verify(lg.logs.get(0)).finalizeLog();
Mockito.verify(lg.logs.get(1)).finalizeLog();
}
-
+
+ /**
+ * Mock out the log at the given path to return a specified number
+ * of transactions upon validation.
+ */
+ private void mockLogValidation(
+ FSImageTransactionalStorageInspector inspector,
+ String path, int numValidTransactions) throws IOException {
+
+ for (LogGroup lg : inspector.logGroups.values()) {
+ List<FoundEditLog> logs = lg.logs;
+ for (int i = 0; i < logs.size(); i++) {
+ FoundEditLog log = logs.get(i);
+ if (log.file.getPath().equals(path)) {
+ // mock out its validation
+ FoundEditLog spyLog = spy(log);
+ doReturn(new FSEditLogLoader.EditLogValidation(-1, numValidTransactions))
+ .when(spyLog).validateLog();
+ logs.set(i, spyLog);
+ return;
+ }
+ }
+ }
+ fail("No log found to mock out at " + path);
+ }
+
/**
* Test when edits and image are in separate directories.
*/
@@ -306,6 +331,9 @@ public class TestFSImageStorageInspector
inspector.inspectDirectory(mockImageDir);
inspector.inspectDirectory(mockEditsDir);
inspector.inspectDirectory(mockImageDir2);
+
+ mockLogValidation(inspector,
+ "/foo3/current/edits_inprogress_457", 2);
assertEquals(2, inspector.foundEditLogs.size());
assertEquals(2, inspector.foundImages.size());
Modified: hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java?rev=1139463&r1=1139462&r2=1139463&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java Fri Jun 24 23:32:33 2011
@@ -77,12 +77,11 @@ public abstract class GenericTestUtils {
Joiner.on(",").join(expectedSet));
}
- public static void assertExceptionContains(String string, IOException ioe) {
- String msg = ioe.getMessage();
+ public static void assertExceptionContains(String string, Throwable t) {
+ String msg = t.getMessage();
Assert.assertTrue(
- "Unexpected exception:" + StringUtils.stringifyException(ioe),
- msg.contains(string));
-
+ "Unexpected exception:" + StringUtils.stringifyException(t),
+ msg.contains(string));
}
/**