You are viewing a plain text version of this content. The canonical link for it is here.
Posted to hdfs-commits@hadoop.apache.org by to...@apache.org on 2011/06/24 23:46:03 UTC

svn commit: r1139450 - in /hadoop/common/branches/HDFS-1073/hdfs: ./ src/java/org/apache/hadoop/hdfs/server/namenode/ src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/ src/test/hdfs/org/apache/hadoop/test/

Author: todd
Date: Fri Jun 24 21:46:03 2011
New Revision: 1139450

URL: http://svn.apache.org/viewvc?rev=1139450&view=rev
Log:
HDFS-2077. Address checkpoint upload when one of the storage dirs is failed. Contributed by Todd Lipcon.

Modified:
    hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt
    hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java
    hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java
    hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java
    hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java
    hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestTransferFsImage.java
    hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java

Modified: hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt?rev=1139450&r1=1139449&r2=1139450&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt Fri Jun 24 21:46:03 2011
@@ -55,3 +55,5 @@ HDFS-2074. Determine edit log validity b
 HDFS-2085. Finalize in-progress edit logs at startup. (todd)
 HDFS-2026. SecondaryNameNode should properly handle the case where the
            NameNode is reformatted. (todd)
+HDFS-2077. Address checkpoint upload when one of the storage dirs is failed
+           (todd)

Modified: hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java?rev=1139450&r1=1139449&r2=1139450&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java Fri Jun 24 21:46:03 2011
@@ -34,6 +34,7 @@ import java.util.regex.Pattern;
 import org.apache.commons.lang.StringUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.hdfs.protocol.FSConstants;
 import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
 import org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.EditLogValidation;
@@ -74,8 +75,16 @@ class FSImageTransactionalStorageInspect
     }
     
     File currentDir = sd.getCurrentDir();
+    File filesInStorage[];
+    try {
+      filesInStorage = FileUtil.listFiles(currentDir);
+    } catch (IOException ioe) {
+      LOG.warn("Unable to inspect storage directory " + currentDir,
+          ioe);
+      return;
+    }
 
-    for (File f : currentDir.listFiles()) {
+    for (File f : filesInStorage) {
       LOG.debug("Checking file " + f);
       String name = f.getName();
       

Modified: hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java?rev=1139450&r1=1139449&r2=1139450&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java Fri Jun 24 21:46:03 2011
@@ -832,7 +832,7 @@ public class NNStorage extends Storage i
    * @param sds A list of storage directories to mark as errored.
    * @throws IOException
    */
-  void reportErrorsOnDirectories(List<StorageDirectory> sds) throws IOException {
+  void reportErrorsOnDirectories(List<StorageDirectory> sds) {
     for (StorageDirectory sd : sds) {
       reportErrorsOnDirectory(sd);
     }
@@ -846,8 +846,7 @@ public class NNStorage extends Storage i
    * @param sd A storage directory to mark as errored.
    * @throws IOException
    */
-  void reportErrorsOnDirectory(StorageDirectory sd)
-      throws IOException {
+  void reportErrorsOnDirectory(StorageDirectory sd) {
     LOG.warn("Error reported on storage directory " + sd);
 
     String lsd = listStorageDirectories();
@@ -905,6 +904,29 @@ public class NNStorage extends Storage i
   }
   
   /**
+   * Report that an IOE has occurred on some file which may
+   * or may not be within one of the NN image storage directories.
+   */
+  void reportErrorOnFile(File f) {
+    // We use getAbsolutePath here instead of getCanonicalPath since we know
+    // that there is some IO problem on that drive.
+    // getCanonicalPath may need to call stat() or readlink() and it's likely
+    // those calls would fail due to the same underlying IO problem.
+    String absPath = f.getAbsolutePath();
+    for (StorageDirectory sd : storageDirs) {
+      String dirPath = sd.getRoot().getAbsolutePath();
+      if (!dirPath.endsWith("/")) {
+        dirPath += "/";
+      }
+      if (absPath.startsWith(dirPath)) {
+        reportErrorsOnDirectory(sd);
+        return;
+      }
+    }
+    
+  }
+  
+  /**
    * Generate new clusterID.
    * 
    * clusterID is a persistent attribute of the cluster.

Modified: hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java?rev=1139450&r1=1139449&r2=1139450&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java Fri Jun 24 21:46:03 2011
@@ -69,7 +69,7 @@ class TransferFsImage implements FSConst
       throw new IOException("No targets in destination storage!");
     }
     
-    MD5Hash hash = getFileClient(fsName, fileid, dstFiles, needDigest);
+    MD5Hash hash = getFileClient(fsName, fileid, dstFiles, dstStorage, needDigest);
     LOG.info("Downloaded file " + dstFiles.get(0).getName() + " size " +
         dstFiles.get(0).length() + " bytes.");
     return hash;
@@ -85,7 +85,7 @@ class TransferFsImage implements FSConst
     List<File> dstFiles = dstStorage.getFiles(NameNodeDirType.EDITS, fileName);
     assert !dstFiles.isEmpty() : "No checkpoint targets.";
 
-    getFileClient(fsName, fileid, dstFiles, false);
+    getFileClient(fsName, fileid, dstFiles, dstStorage, false);
     LOG.info("Downloaded file " + dstFiles.get(0).getName() + " size " +
         dstFiles.get(0).length() + " bytes.");
   }
@@ -107,7 +107,7 @@ class TransferFsImage implements FSConst
         txid, imageListenAddress, storage);
     // this doesn't directly upload an image, but rather asks the NN
     // to connect back to the 2NN to download the specified image.
-    TransferFsImage.getFileClient(fsName, fileid, null, false);
+    TransferFsImage.getFileClient(fsName, fileid, null, null, false);
     LOG.info("Uploaded image with txid " + txid + " to namenode at " +
     		fsName);
   }
@@ -168,12 +168,13 @@ class TransferFsImage implements FSConst
   /**
    * Client-side Method to fetch file from a server
    * Copies the response from the URL to a list of local files.
-   * 
+   * @param dstStorage if an error occurs writing to one of the files,
+   *                   this storage object will be notified. 
    * @Return a digest of the received file if getChecksum is true
    */
   static MD5Hash getFileClient(String nnHostPort,
       String queryString, List<File> localPaths,
-      boolean getChecksum) throws IOException {
+      NNStorage dstStorage, boolean getChecksum) throws IOException {
     byte[] buf = new byte[BUFFER_SIZE];
     String proto = UserGroupInformation.isSecurityEnabled() ? "https://" : "http://";
     StringBuilder str = new StringBuilder(proto+nnHostPort+"/getimage?");
@@ -215,20 +216,29 @@ class TransferFsImage implements FSConst
     }
     boolean finishedReceiving = false;
 
-    if (localPaths == null) {
-      localPaths = Collections.emptyList(); 
-    }
-    
     List<FileOutputStream> outputStreams = Lists.newArrayList();
 
     try {
-      for (File f : localPaths) {
-        if (f.exists()) {
-          LOG.warn("Overwriting existing file " + f
-              + " with file downloaded from " + str);
+      if (localPaths != null) {
+        for (File f : localPaths) {
+          try {
+            if (f.exists()) {
+              LOG.warn("Overwriting existing file " + f
+                  + " with file downloaded from " + str);
+            }
+            outputStreams.add(new FileOutputStream(f));
+          } catch (IOException ioe) {
+            LOG.warn("Unable to download file " + f, ioe);
+            dstStorage.reportErrorOnFile(f);
+          }
+        }
+        
+        if (outputStreams.isEmpty()) {
+          throw new IOException(
+              "Unable to download to any storage directory");
         }
-        outputStreams.add(new FileOutputStream(f));
       }
+      
       int num = 1;
       while (num > 0) {
         num = stream.read(buf);

Modified: hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java?rev=1139450&r1=1139449&r2=1139450&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java Fri Jun 24 21:46:03 2011
@@ -1380,6 +1380,66 @@ public class TestCheckpoint extends Test
     }  
   }
 
+  /**
+   * Test that, if a storage directory is failed when a checkpoint occurs,
+   * the non-failed storage directory receives the checkpoint.
+   */
+  @SuppressWarnings("deprecation")
+  public void testCheckpointWithFailedStorageDir() throws Exception {
+    MiniDFSCluster cluster = null;
+    SecondaryNameNode secondary = null;
+    File currentDir = null;
+    
+    Configuration conf = new HdfsConfiguration();
+
+    try {
+      cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0)
+          .format(true).build();
+  
+      secondary = startSecondaryNameNode(conf);
+
+      // Checkpoint once
+      secondary.doCheckpoint();
+
+      // Now primary NN experiences failure of a volume -- fake by
+      // setting its current dir to a-x permissions
+      NameNode nn = cluster.getNameNode();
+      NNStorage storage = nn.getFSImage().getStorage();
+      StorageDirectory sd0 = storage.getStorageDir(0);
+      StorageDirectory sd1 = storage.getStorageDir(1);
+      
+      currentDir = sd0.getCurrentDir();
+      currentDir.setExecutable(false);
+
+      // Upload checkpoint when NN has a bad storage dir. This should
+      // succeed and create the checkpoint in the good dir.
+      secondary.doCheckpoint();
+      
+      GenericTestUtils.assertExists(
+          new File(sd1.getCurrentDir(), NNStorage.getImageFileName(2)));
+      
+      // Restore the good dir
+      currentDir.setExecutable(true);
+      nn.restoreFailedStorage("true");
+      nn.rollEditLog();
+
+      // Checkpoint again -- this should upload to both dirs
+      secondary.doCheckpoint();
+      
+      assertNNHasCheckpoints(cluster, ImmutableList.of(8));
+      assertParallelFilesInvariant(cluster, ImmutableList.of(secondary));
+    } finally {
+      if (currentDir != null) {
+        currentDir.setExecutable(true);
+      }
+      if (secondary != null) {
+        secondary.shutdown();
+      }
+      if (cluster != null) {
+        cluster.shutdown();
+      }
+    }
+  }
 
   @SuppressWarnings("deprecation")
   private void cleanup(SecondaryNameNode snn) {

Modified: hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestTransferFsImage.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestTransferFsImage.java?rev=1139450&r1=1139449&r2=1139450&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestTransferFsImage.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestTransferFsImage.java Fri Jun 24 21:46:03 2011
@@ -30,34 +30,72 @@ import org.apache.hadoop.hdfs.HdfsConfig
 import org.apache.hadoop.hdfs.MiniDFSCluster;
 import org.apache.hadoop.util.StringUtils;
 import org.junit.Test;
+import org.mockito.Mockito;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
 
 
 public class TestTransferFsImage {
 
+  private static final File TEST_DIR = new File(
+      System.getProperty("test.build.data","build/test/data"));
+
   /**
    * Regression test for HDFS-1997. Test that, if an exception
-   * occurs on the client side, it is properly reported as such
+   * occurs on the client side, it is properly reported as such,
+   * and reported to the associated NNStorage object.
    */
   @Test
   public void testClientSideException() throws IOException {
-
     Configuration conf = new HdfsConfiguration();
     MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
       .numDataNodes(0).build();
+    NNStorage mockStorage = Mockito.mock(NNStorage.class);
+    List<File> localPath = Collections.<File>singletonList(
+        new File("/xxxxx-does-not-exist/blah"));
+       
     try {
-      
       String fsName = NameNode.getHostPortString(
           cluster.getNameNode().getHttpAddress());
       String id = "getimage=1&txid=0";
 
-      List<File> localPath = Collections.<File>singletonList(
-         new File("/xxxxx-does-not-exist/blah"));
-    
-      TransferFsImage.getFileClient(fsName, id, localPath, false);
+      TransferFsImage.getFileClient(fsName, id, localPath, mockStorage, false);      
       fail("Didn't get an exception!");
     } catch (IOException ioe) {
-      assertTrue("Expected FNFE, got: " + StringUtils.stringifyException(ioe),
-          ioe instanceof FileNotFoundException);
+      Mockito.verify(mockStorage).reportErrorOnFile(localPath.get(0));
+      assertTrue(
+          "Unexpected exception: " + StringUtils.stringifyException(ioe),
+          ioe.getMessage().contains("Unable to download to any storage"));
+    } finally {
+      cluster.shutdown();      
+    }
+  }
+  
+  /**
+   * Similar to the above test, except that there are multiple local files
+   * and one of them can be saved.
+   */
+  @Test
+  public void testClientSideExceptionOnJustOneDir() throws IOException {
+    Configuration conf = new HdfsConfiguration();
+    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+      .numDataNodes(0).build();
+    NNStorage mockStorage = Mockito.mock(NNStorage.class);
+    List<File> localPaths = ImmutableList.of(
+        new File("/xxxxx-does-not-exist/blah"),
+        new File(TEST_DIR, "testfile")    
+        );
+       
+    try {
+      String fsName = NameNode.getHostPortString(
+          cluster.getNameNode().getHttpAddress());
+      String id = "getimage=1&txid=0";
+
+      TransferFsImage.getFileClient(fsName, id, localPaths, mockStorage, false);      
+      Mockito.verify(mockStorage).reportErrorOnFile(localPaths.get(0));
+      assertTrue("The valid local file should get saved properly",
+          localPaths.get(1).length() > 0);
     } finally {
       cluster.shutdown();      
     }

Modified: hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java?rev=1139450&r1=1139449&r2=1139450&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java Fri Jun 24 21:46:03 2011
@@ -17,11 +17,13 @@
  */
 package org.apache.hadoop.test;
 
+import java.io.File;
 import java.io.IOException;
 import java.util.concurrent.CountDownLatch;
 
 import org.apache.commons.logging.Log;
 import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
+import org.junit.Assert;
 import org.mockito.Mockito;
 import org.mockito.invocation.InvocationOnMock;
 import org.mockito.stubbing.Answer;
@@ -40,6 +42,13 @@ public abstract class GenericTestUtils {
   }
   
   /**
+   * Assert that a given file exists.
+   */
+  public static void assertExists(File f) {
+    Assert.assertTrue("File " + f + " should exist", f.exists());
+  }
+  
+  /**
    * Mockito answer helper that triggers one latch as soon as the
    * method is called, then waits on another before continuing.
    */