Posted to common-commits@hadoop.apache.org by sz...@apache.org on 2012/03/23 02:32:34 UTC

svn commit: r1304158 - in /hadoop/common/branches/branch-1: CHANGES.txt src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java src/test/org/apache/hadoop/hdfs/server/namenode/TestStorageRestore.java

Author: szetszwo
Date: Fri Mar 23 01:32:33 2012
New Revision: 1304158

URL: http://svn.apache.org/viewvc?rev=1304158&view=rev
Log:
HDFS-3127. Do not throw exceptions when FSImage.restoreStorageDirs() fails. Contributed by Brandon Li
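
In short, restoreStorageDirs() now treats restoring previously removed
storage directories as best effort: a failure is logged and skipped rather
than propagated, so the operation that triggered the restore can complete
even when a directory is still unusable. Schematically (copyGoodFilesInto
and reinstate below are hypothetical helpers standing in for the real
copy-and-add-back logic, not actual branch-1 methods):

    // Best-effort restore: try each removed directory independently and
    // keep going on failure, instead of collecting the IOExceptions and
    // rethrowing them as a MultipleIOException.
    for (StorageDirectory sd : getRemovedStorageDirs()) {
      try {
        copyGoodFilesInto(sd);  // copy fsimage/edits from a healthy dir
        reinstate(sd);          // add the directory back to the active set
      } catch (IOException e) {
        FSNamesystem.LOG.warn("Failed to recover removed directory "
            + sd.getRoot() + " with " + e);
        // ignore; the directory stays on the removed list until a later
        // restore attempt succeeds
      }
    }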

Modified:
    hadoop/common/branches/branch-1/CHANGES.txt
    hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java
    hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestStorageRestore.java

Modified: hadoop/common/branches/branch-1/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/CHANGES.txt?rev=1304158&r1=1304157&r2=1304158&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/CHANGES.txt (original)
+++ hadoop/common/branches/branch-1/CHANGES.txt Fri Mar 23 01:32:33 2012
@@ -235,6 +235,9 @@ Release 1.0.2 - 2012.03.18
     HADOOP-8132. 64bit secure datanodes do not start as the jsvc path is wrong
     (Arpit Gupta via mattf)
 
+    HDFS-3127. Do not throw exceptions when FSImage.restoreStorageDirs() fails.
+    (Brandon Li via szetszwo)
+
 Release 1.0.1 - 2012.02.14
 
   NEW FEATURES

Modified: hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java?rev=1304158&r1=1304157&r2=1304158&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java (original)
+++ hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java Fri Mar 23 01:32:33 2012
@@ -1236,21 +1236,23 @@ public class FSImage extends Storage {
   /** 
    * Refresh storage dirs by copying files from good storage dir
    */
-  void restoreStorageDirs() throws IOException {
+  void restoreStorageDirs() {
     if (!restoreRemovedDirs || getRemovedStorageDirs().isEmpty()) {
       return;
     }
     
     Iterator<StorageDirectory> it = dirIterator(NameNodeDirType.EDITS);
     if (!it.hasNext()) {
-      throw new IOException("No healthy edits directory");
+      FSNamesystem.LOG.warn("No healthy edits directory");
+      return;
     }
     StorageDirectory goodSd = it.next();
     File goodEdits = getEditFile(goodSd);
 
     it = dirIterator(NameNodeDirType.IMAGE);
     if (!it.hasNext()) {
-      throw new IOException("No healthy fsimage directory");
+      FSNamesystem.LOG.warn("No healthy fsimage directory");
+      return;
     }
     goodSd = it.next();
     File goodImage = getImageFile(goodSd, NameNodeFile.IMAGE);
@@ -1259,7 +1261,6 @@ public class FSImage extends Storage {
     //for Hadoop version < 0.13 to fail to start
     File goodImage013 = new File(goodSd.getRoot(), "image/fsimage");
 
-    List<IOException> exceptions = new ArrayList<IOException>();
     for (Iterator<StorageDirectory> i = removedStorageDirs.iterator();
         i.hasNext();) {
       StorageDirectory sd = i.next();
@@ -1303,13 +1304,9 @@ public class FSImage extends Storage {
       } catch (IOException e) {
         FSNamesystem.LOG.warn("Failed to recover removed directory "
             + sd.getRoot() + " with " + e);
-        exceptions.add(e);
+        //ignore restore exception
       }
     }
-    
-    if (!exceptions.isEmpty()) {
-      throw MultipleIOException.createIOException(exceptions);
-    }
   }
   
   

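Since restoreStorageDirs() no longer declares IOException, its caller on
the edit-log roll/checkpoint path (shown schematically below; the exact
branch-1 call site is assumed, not quoted) no longer has to treat a failed
restore as a failed checkpoint:

    // Schematic caller-side view: a restore failure can no longer abort
    // the roll/checkpoint that triggered it.
    synchronized void rollEditLog() throws IOException {
      restoreStorageDirs();  // best effort; warns and returns on failure
      // ... proceed with the roll whether or not the restore succeeded
    }
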
Modified: hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestStorageRestore.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestStorageRestore.java?rev=1304158&r1=1304157&r2=1304158&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestStorageRestore.java (original)
+++ hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestStorageRestore.java Fri Mar 23 01:32:33 2012
@@ -114,6 +114,7 @@ public class TestStorageRestore extends 
    * clean up
    */
   public void tearDown() throws Exception {
+    restoreAccess();    
     if (hdfsDir.exists() && !FileUtil.fullyDelete(hdfsDir)) {
       throw new IOException("Could not delete hdfs directory in tearDown '"
           + hdfsDir + "'");
@@ -136,6 +137,53 @@ public class TestStorageRestore extends 
   }
 
   /**
+   * invalidate storage by removing rwx permissions from name2 and name3
+   */
+  public void removeStorageAccess(FSImage fi) throws IOException {
+    path2.setReadable(false);
+    path2.setExecutable(false);
+    path2.setWritable(false);
+    path3.setReadable(false);
+    path3.setExecutable(false);
+    path3.setWritable(false);
+    
+    for (Iterator<StorageDirectory> it = fi.dirIterator(); it.hasNext();) {
+      StorageDirectory sd = it.next();
+      
+      if (sd.getRoot().equals(path2) || sd.getRoot().equals(path3)) {
+        fi.getEditLog().removeEditsForStorageDir(sd);
+        fi.updateRemovedDirs(sd, null);
+        it.remove();
+      }
+    }
+  }
+  
+  public void restoreAccess() {
+    if (path2.exists()) {
+      path2.setReadable(true);
+      path2.setExecutable(true);
+      path2.setWritable(true);
+    }
+    if (path3.exists()) {
+      path3.setReadable(true);    
+      path3.setExecutable(true);
+      path3.setWritable(true);
+    }
+  }
+  
+  /**
+   * get the total number of healthy storage directories
+   */
+  public int numStorageDirs(FSImage fi) throws IOException {
+    int sum = 0;
+    for (Iterator<StorageDirectory> it = fi.dirIterator(); it.hasNext();) {
+      it.next(); // advance the iterator; without this the loop never ends
+      sum++;
+    }
+    return sum;
+  }
+  
+  /**
    * test
    */
   public void printStorages(FSImage fs) {
@@ -236,9 +283,10 @@ public class TestStorageRestore extends 
 
   /**
    * test 
-   * 1. create DFS cluster with 3 storage directories - 2 EDITS_IMAGE, 1 EDITS 
+   * 1. create DFS cluster with 3 storage directories
+   *    - 2 EDITS_IMAGE(name1, name2), 1 EDITS(name3)
    * 2. create a cluster and write a file 
-   * 3. corrupt/disable one storage (or two) by removing 
+   * 3. corrupt/disable name2 and name3 by removing subdir "current"  
    * 4. run doCheckpoint - it will fail on removed dirs (which will invalidate the storages)
    * 5. write another file 
    * 6. check that edits and fsimage differ 
@@ -298,4 +346,58 @@ public class TestStorageRestore extends 
     secondary.shutdown();
     cluster.shutdown();
   }
+
+  /**
+   * 1. create DFS cluster with 3 storage directories
+   *    - 2 EDITS_IMAGE(name1, name2), 1 EDITS(name3)
+   * 2. create a file
+   * 3. corrupt/disable name2 and name3 by removing rwx permissions
+   * 4. run doCheckpoint - it will fail on removed dirs (which will invalidate the storages)
+   * 5. write another file
+   * 6. check that edits and fsimage differ
+   * 7. run doCheckpoint - recover should fail but checkpoint should succeed 
+   * 8. restore the access permissions for name2 and name3, run checkpoint again
+   * 9. verify that all the image and edits files are the same.
+   */
+  public void testStorageRestoreFailure() throws Exception {
+    int numDatanodes = 2;
+    cluster = new MiniDFSCluster(0, config, numDatanodes, true, false, true,
+        null, null, null, null);
+    cluster.waitActive();
+
+    SecondaryNameNode secondary = new SecondaryNameNode(config);
+    System.out.println("****testStorageRestore: Cluster and SNN started");
+    printStorages(cluster.getNameNode().getFSImage());
+
+    FileSystem fs = cluster.getFileSystem();
+    Path path = new Path("/", "test");
+    writeFile(fs, path, 2);
+
+    System.out
+        .println("****testStorageRestore: file test written, invalidating storage...");
+
+    removeStorageAccess(cluster.getNameNode().getFSImage());
+    printStorages(cluster.getNameNode().getFSImage());
+    System.out
+        .println("****testStorageRestore: storage invalidated + doCheckpoint");
+
+    path = new Path("/", "test1");
+    writeFile(fs, path, 2);
+    System.out.println("****testStorageRestore: file test1 written");
+    assert(numStorageDirs(cluster.getNameNode().getFSImage()) == 1);
+
+    System.out.println("****testStorageRestore: checkfiles(false) run");
+
+    secondary.doCheckpoint(); // still can't recover removed storage dirs
+    assert(numStorageDirs(cluster.getNameNode().getFSImage()) == 1);
+
+    restoreAccess();
+    secondary.doCheckpoint(); // should restore removed storage dirs
+    checkFiles(true);
+
+    System.out
+        .println("****testStorageRestore: second Checkpoint done and checkFiles(true) run");
+    secondary.shutdown();
+    cluster.shutdown();
+  }
 }
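
The new testStorageRestoreFailure case disables name2 and name3 by
stripping their rwx bits with the java.io.File permission setters
(available since Java 6) rather than by deleting the "current"
subdirectory, which is also why tearDown() must call restoreAccess()
before it can delete the test tree. A self-contained sketch of the
technique (the path below is illustrative only; note that these setters
may report success yet have no effect when running as root, since root
bypasses POSIX permission checks):

    import java.io.File;

    public class TogglePermsDemo {
      /** Make a directory unusable, as removeStorageAccess() does. */
      static void deny(File dir) {
        dir.setReadable(false);
        dir.setExecutable(false);
        dir.setWritable(false);
      }

      /** Undo deny(), as restoreAccess() does; needed before cleanup. */
      static void allow(File dir) {
        dir.setReadable(true);
        dir.setExecutable(true);
        dir.setWritable(true);
      }

      public static void main(String[] args) {
        File dir = new File("/tmp/name2-demo");  // illustrative path
        dir.mkdirs();
        deny(dir);
        System.out.println("canWrite after deny:  " + dir.canWrite());
        allow(dir);
        System.out.println("canWrite after allow: " + dir.canWrite());
      }
    }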