You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by sz...@apache.org on 2012/03/23 02:41:16 UTC
svn commit: r1304161 - in /hadoop/common/branches/branch-1.0: ./ CHANGES.txt
src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java
src/test/org/apache/hadoop/hdfs/server/namenode/TestStorageRestore.java
Author: szetszwo
Date: Fri Mar 23 01:41:15 2012
New Revision: 1304161
URL: http://svn.apache.org/viewvc?rev=1304161&view=rev
Log:
svn merge -c 1304158 from branch-1 for HDFS-3127.
Modified:
hadoop/common/branches/branch-1.0/ (props changed)
hadoop/common/branches/branch-1.0/CHANGES.txt (contents, props changed)
hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java
hadoop/common/branches/branch-1.0/src/test/org/apache/hadoop/hdfs/server/namenode/TestStorageRestore.java
Propchange: hadoop/common/branches/branch-1.0/
------------------------------------------------------------------------------
Merged /hadoop/common/branches/branch-1:r1304158
Modified: hadoop/common/branches/branch-1.0/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1.0/CHANGES.txt?rev=1304161&r1=1304160&r2=1304161&view=diff
==============================================================================
--- hadoop/common/branches/branch-1.0/CHANGES.txt (original)
+++ hadoop/common/branches/branch-1.0/CHANGES.txt Fri Mar 23 01:41:15 2012
@@ -71,6 +71,9 @@ Release 1.0.2 - 2012.03.18
HADOOP-8132. 64bit secure datanodes do not start as the jsvc path is wrong
(Arpit Gupta via mattf)
+ HDFS-3127. Do not throw exceptions when FSImage.restoreStorageDirs() fails.
+ (Brandon Li via szetszwo)
+
Release 1.0.1 - 2012.02.14
NEW FEATURES
Propchange: hadoop/common/branches/branch-1.0/CHANGES.txt
------------------------------------------------------------------------------
Merged /hadoop/common/branches/branch-1/CHANGES.txt:r1304158
Modified: hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java?rev=1304161&r1=1304160&r2=1304161&view=diff
==============================================================================
--- hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java (original)
+++ hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java Fri Mar 23 01:41:15 2012
@@ -1234,21 +1234,23 @@ public class FSImage extends Storage {
/**
* Refresh storage dirs by copying files from good storage dir
*/
- void restoreStorageDirs() throws IOException {
+ void restoreStorageDirs() {
if (!restoreRemovedDirs || getRemovedStorageDirs().isEmpty()) {
return;
}
Iterator<StorageDirectory> it = dirIterator(NameNodeDirType.EDITS);
if (!it.hasNext()) {
- throw new IOException("No healthy edits directory");
+ FSNamesystem.LOG.warn("No healthy edits directory");
+ return;
}
StorageDirectory goodSd = it.next();
File goodEdits = getEditFile(goodSd);
it = dirIterator(NameNodeDirType.IMAGE);
if (!it.hasNext()) {
- throw new IOException("No healthy fsimage directory");
+ FSNamesystem.LOG.warn("No healthy fsimage directory");
+ return;
}
goodSd = it.next();
File goodImage = getImageFile(goodSd, NameNodeFile.IMAGE);
@@ -1257,7 +1259,6 @@ public class FSImage extends Storage {
//for Hadoop version < 0.13 to fail to start
File goodImage013 = new File(goodSd.getRoot(), "image/fsimage");
- List<IOException> exceptions = new ArrayList<IOException>();
for (Iterator<StorageDirectory> i = removedStorageDirs.iterator();
i.hasNext();) {
StorageDirectory sd = i.next();
@@ -1301,13 +1302,9 @@ public class FSImage extends Storage {
} catch (IOException e) {
FSNamesystem.LOG.warn("Failed to recover removed directory "
+ sd.getRoot() + " with " + e);
- exceptions.add(e);
+ //ignore restore exception
}
}
-
- if (!exceptions.isEmpty()) {
- throw MultipleIOException.createIOException(exceptions);
- }
}
Modified: hadoop/common/branches/branch-1.0/src/test/org/apache/hadoop/hdfs/server/namenode/TestStorageRestore.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1.0/src/test/org/apache/hadoop/hdfs/server/namenode/TestStorageRestore.java?rev=1304161&r1=1304160&r2=1304161&view=diff
==============================================================================
--- hadoop/common/branches/branch-1.0/src/test/org/apache/hadoop/hdfs/server/namenode/TestStorageRestore.java (original)
+++ hadoop/common/branches/branch-1.0/src/test/org/apache/hadoop/hdfs/server/namenode/TestStorageRestore.java Fri Mar 23 01:41:15 2012
@@ -114,6 +114,7 @@ public class TestStorageRestore extends
* clean up
*/
public void tearDown() throws Exception {
+ restoreAccess();
if (hdfsDir.exists() && !FileUtil.fullyDelete(hdfsDir)) {
throw new IOException("Could not delete hdfs directory in tearDown '"
+ hdfsDir + "'");
@@ -136,6 +137,52 @@ public class TestStorageRestore extends
}
/**
+ * invalidate storage by removing xwr permission from name2 and name3
+ */
+ public void removeStorageAccess(FSImage fi) throws IOException {
+ path2.setReadable(false);
+ path2.setExecutable(false);
+ path2.setWritable(false);
+ path3.setReadable(false);
+ path3.setExecutable(false);
+ path3.setWritable(false);
+
+ for (Iterator<StorageDirectory> it = fi.dirIterator(); it.hasNext();) {
+ StorageDirectory sd = it.next();
+
+ if (sd.getRoot().equals(path2) || sd.getRoot().equals(path3)) {
+ fi.getEditLog().removeEditsForStorageDir(sd);
+ fi.updateRemovedDirs(sd, null);
+ it.remove();
+ }
+ }
+ }
+
+ public void restoreAccess() {
+ if (path2.exists()) {
+ path2.setReadable(true);
+ path2.setExecutable(true);
+ path2.setWritable(true);
+ }
+ if (path3.exists()) {
+ path3.setReadable(true);
+ path3.setExecutable(true);
+ path3.setWritable(true);
+ }
+ }
+
+ /**
+ * get the total number of healthy storage directories
+ */
+ public int numStorageDirs(FSImage fi) throws IOException {
+ int sum = 0;
+ for (Iterator<StorageDirectory> it = fi.dirIterator(); it.hasNext();) {
+ it.next();
+ sum++;
+ }
+ return sum;
+ }
+
+ /**
* test
*/
public void printStorages(FSImage fs) {
@@ -236,9 +283,10 @@ public class TestStorageRestore extends
/**
* test
- * 1. create DFS cluster with 3 storage directories - 2 EDITS_IMAGE, 1 EDITS
+ * 1. create DFS cluster with 3 storage directories
+ * - 2 EDITS_IMAGE(name1, name2), 1 EDITS(name3)
* 2. create a cluster and write a file
- * 3. corrupt/disable one storage (or two) by removing
+ * 3. corrupt/disable name2 and name3 by removing subdir "current"
* 4. run doCheckpoint - it will fail on removed dirs (which will invalidate the storages)
* 5. write another file
* 6. check that edits and fsimage differ
@@ -298,4 +346,58 @@ public class TestStorageRestore extends
secondary.shutdown();
cluster.shutdown();
}
+
+ /**
+ * 1. create DFS cluster with 3 storage directories
+ * - 2 EDITS_IMAGE(name1, name2), 1 EDITS(name3)
+ * 2. create a file
+ * 3. corrupt/disable name2 and name3 by removing xwr permission
+ * 4. run doCheckpoint - it will fail on removed dirs (which will invalidate the storages)
+ * 5. write another file
+ * 6. check that edits and fsimage differ
+ * 7. run doCheckpoint - recover should fail but checkpoint should succeed
+ * 8. restore the access permission for name2 and name3, run checkpoint again
+ * 9. verify that all the image and edits files are the same.
+ */
+ public void testStorageRestoreFailure() throws Exception {
+ int numDatanodes = 2;
+ cluster = new MiniDFSCluster(0, config, numDatanodes, true, false, true,
+ null, null, null, null);
+ cluster.waitActive();
+
+ SecondaryNameNode secondary = new SecondaryNameNode(config);
+ System.out.println("****testStorageRestore: Cluster and SNN started");
+ printStorages(cluster.getNameNode().getFSImage());
+
+ FileSystem fs = cluster.getFileSystem();
+ Path path = new Path("/", "test");
+ writeFile(fs, path, 2);
+
+ System.out
+ .println("****testStorageRestore: file test written, invalidating storage...");
+
+ removeStorageAccess(cluster.getNameNode().getFSImage());
+ printStorages(cluster.getNameNode().getFSImage());
+ System.out
+ .println("****testStorageRestore: storage invalidated + doCheckpoint");
+
+ path = new Path("/", "test1");
+ writeFile(fs, path, 2);
+ System.out.println("****testStorageRestore: file test1 written");
+ assert(numStorageDirs(cluster.getNameNode().getFSImage()) == 1);
+
+ System.out.println("****testStorageRestore: checkfiles(false) run");
+
+ secondary.doCheckpoint(); // still can't recover removed storage dirs
+ assert(numStorageDirs(cluster.getNameNode().getFSImage()) == 1);
+
+ restoreAccess();
+ secondary.doCheckpoint(); // should restore removed storage dirs
+ checkFiles(true);
+
+ System.out
+ .println("****testStorageRestore: second Checkpoint done and checkFiles(true) run");
+ secondary.shutdown();
+ cluster.shutdown();
+ }
}