Posted to hdfs-commits@hadoop.apache.org by ma...@apache.org on 2011/06/30 20:38:01 UTC

svn commit: r1141658 - in /hadoop/common/trunk/hdfs: ./ src/java/org/apache/hadoop/hdfs/server/namenode/ src/test/hdfs/org/apache/hadoop/hdfs/

Author: mattf
Date: Thu Jun 30 18:38:01 2011
New Revision: 1141658

URL: http://svn.apache.org/viewvc?rev=1141658&view=rev
Log:
HDFS-1955. FSImage.doUpgrade() was made too fault-tolerant by HDFS-1826. Contributed by Matt Foley.
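
Context for the change: HDFS-1826 made doUpgrade() tolerate individual storage-directory failures, which is acceptable in normal operation but wrong during an upgrade, where losing any directory must abort the whole operation. Below is a minimal sketch (not the committed code) of the fail-fast shape this commit gives FSImage.doUpgrade(); upgradeOneDirectory() is a hypothetical stand-in for the real per-directory upgrade work:

    // Sketch only, simplified from FSImage.doUpgrade(). Assumes
    // storage.dirIterator() yields every configured StorageDirectory.
    List<StorageDirectory> errorSDs = new ArrayList<StorageDirectory>();
    for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
      StorageDirectory sd = it.next();
      try {
        upgradeOneDirectory(sd);  // hypothetical per-directory upgrade step
      } catch (IOException ioe) {
        errorSDs.add(sd);         // record the failure, let the loop finish
      }
    }
    if (!errorSDs.isEmpty()) {
      storage.reportErrorsOnDirectories(errorSDs);
      // During upgrade, failing any storage directory is fatal.
      throw new IOException("Upgrade failed in " + errorSDs.size()
          + " storage directory(ies), previously logged.");
    }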

Modified:
    hadoop/common/trunk/hdfs/CHANGES.txt
    hadoop/common/trunk/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java
    hadoop/common/trunk/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
    hadoop/common/trunk/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java
    hadoop/common/trunk/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/TestDFSUpgrade.java

Modified: hadoop/common/trunk/hdfs/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hdfs/CHANGES.txt?rev=1141658&r1=1141657&r2=1141658&view=diff
==============================================================================
--- hadoop/common/trunk/hdfs/CHANGES.txt (original)
+++ hadoop/common/trunk/hdfs/CHANGES.txt Thu Jun 30 18:38:01 2011
@@ -554,6 +554,9 @@ Trunk (unreleased changes)
 
   BUG FIXES
 
+    HDFS-1955. FSImage.doUpgrade() was made too fault-tolerant by HDFS-1826.
+    (mattf)
+
     HDFS-2061. Two minor bugs in BlockManager block report processing. (mattf)
 
     HDFS-1449. Fix test failures - ExtendedBlock must return 

Modified: hadoop/common/trunk/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java?rev=1141658&r1=1141657&r2=1141658&view=diff
==============================================================================
--- hadoop/common/trunk/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java (original)
+++ hadoop/common/trunk/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java Thu Jun 30 18:38:01 2011
@@ -1228,6 +1228,11 @@ public class FSEditLog implements NNStor
   @Override // NNStorageListener
   public synchronized void errorOccurred(StorageDirectory sd)
       throws IOException {
+    if (editStreams == null) {
+      //errors can occur on storage directories 
+      //before edit streams have been set up
+      return;
+    }
     ArrayList<EditLogOutputStream> errorStreams
       = new ArrayList<EditLogOutputStream>();
 

Modified: hadoop/common/trunk/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java?rev=1141658&r1=1141657&r2=1141658&view=diff
==============================================================================
--- hadoop/common/trunk/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java (original)
+++ hadoop/common/trunk/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java Thu Jun 30 18:38:01 2011
@@ -397,7 +397,12 @@ public class FSImage implements NNStorag
       LOG.info("Upgrade of " + sd.getRoot() + " is complete.");
     }
     isUpgradeFinalized = false;
-    storage.reportErrorsOnDirectories(errorSDs);
+    if (!errorSDs.isEmpty()) {
+      storage.reportErrorsOnDirectories(errorSDs);
+      //during upgrade, it's a fatal error to fail any storage directory
+      throw new IOException("Upgrade failed in " + errorSDs.size()
+          + " storage directory(ies), previously logged.");
+    }
     storage.initializeDistributedUpgrade();
     editLog.open();
   }

Modified: hadoop/common/trunk/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java?rev=1141658&r1=1141657&r2=1141658&view=diff
==============================================================================
--- hadoop/common/trunk/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java (original)
+++ hadoop/common/trunk/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java Thu Jun 30 18:38:01 2011
@@ -905,7 +905,7 @@ public class NNStorage extends Storage i
    */
   void reportErrorsOnDirectory(StorageDirectory sd)
       throws IOException {
-    LOG.warn("Error reported on storage directory " + sd);
+    LOG.error("Error reported on storage directory " + sd);
 
     String lsd = listStorageDirectories();
     LOG.debug("current list of storage dirs:" + lsd);
@@ -914,12 +914,12 @@ public class NNStorage extends Storage i
       listener.errorOccurred(sd);
     }
 
-    LOG.info("About to remove corresponding storage: "
+    LOG.warn("About to remove corresponding storage: "
              + sd.getRoot().getAbsolutePath());
     try {
       sd.unlock();
     } catch (Exception e) {
-      LOG.info("Unable to unlock bad storage directory: "
+      LOG.warn("Unable to unlock bad storage directory: "
                +  sd.getRoot().getPath(), e);
     }
 

Modified: hadoop/common/trunk/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/TestDFSUpgrade.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/TestDFSUpgrade.java?rev=1141658&r1=1141657&r2=1141658&view=diff
==============================================================================
--- hadoop/common/trunk/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/TestDFSUpgrade.java (original)
+++ hadoop/common/trunk/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/TestDFSUpgrade.java Thu Jun 30 18:38:01 2011
@@ -22,18 +22,19 @@ import static org.apache.hadoop.hdfs.ser
 
 import java.io.File;
 import java.io.IOException;
-
-import junit.framework.TestCase;
+import java.util.regex.Pattern;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileUtil;
-import org.apache.hadoop.hdfs.protocol.FSConstants;
 import org.apache.hadoop.hdfs.server.common.Storage;
 import org.apache.hadoop.hdfs.server.common.StorageInfo;
 import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
 import org.apache.hadoop.hdfs.server.namenode.TestParallelImageWrite;
+import org.apache.hadoop.util.StringUtils;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
 import org.junit.Test;
 import static org.junit.Assert.*;
 
@@ -44,8 +45,7 @@ import static org.junit.Assert.*;
 */
 public class TestDFSUpgrade {
  
-  private static final Log LOG = LogFactory.getLog(
-                                                   "org.apache.hadoop.hdfs.TestDFSUpgrade");
+  private static final Log LOG = LogFactory.getLog(TestDFSUpgrade.class.getName());
   private Configuration conf;
   private int testCounter = 0;
   private MiniDFSCluster cluster = null;
@@ -111,11 +111,27 @@ public class TestDFSUpgrade {
       
     }
   }
+
   /**
    * Attempts to start a NameNode with the given operation.  Starting
    * the NameNode should throw an exception.
    */
   void startNameNodeShouldFail(StartupOption operation) {
+    startNameNodeShouldFail(operation, null, null);
+  }
+
+  /**
+   * Attempts to start a NameNode with the given operation.  Starting
+   * the NameNode should throw an exception.
+   * @param operation - NameNode startup operation
+   * @param exceptionClass - if non-null, will check that the caught exception
+   *     is assignment-compatible with exceptionClass
+   * @param messagePattern - if non-null, will check that a substring of the 
+   *     message from the caught exception matches this pattern, via the
+   *     {@link Matcher#find()} method.
+   */
+  void startNameNodeShouldFail(StartupOption operation,
+      Class<? extends Exception> exceptionClass, Pattern messagePattern) {
     try {
       cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0)
                                                 .startupOption(operation)
@@ -123,9 +139,23 @@ public class TestDFSUpgrade {
                                                 .manageDataDfsDirs(false)
                                                 .manageNameDfsDirs(false)
                                                 .build(); // should fail
-      throw new AssertionError("NameNode should have failed to start");
-    } catch (Exception expected) {
-      // expected
+      fail("NameNode should have failed to start");
+      
+    } catch (Exception e) {
+      // expect exception
+      if (exceptionClass != null) {
+        assertTrue("Caught exception is not of expected class "
+            + exceptionClass.getSimpleName() + ": "
+            + StringUtils.stringifyException(e), 
+            exceptionClass.isInstance(e));
+      }
+      if (messagePattern != null) {
+        assertTrue("Caught exception message string does not match expected pattern \""
+            + messagePattern.pattern() + "\" : "
+            + StringUtils.stringifyException(e), 
+            messagePattern.matcher(e.getMessage()).find());
+      }
+      LOG.info("Successfully detected expected NameNode startup failure.");
     }
   }
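
 For illustration, the new overload is exercised later in this patch by testUpgrade4; a caller pins down both the exception type and a fragment of its message like so:

     // Expect the upgrade to abort with an IOException whose message
     // contains "failed in 1 storage" (see the FSImage.doUpgrade() change).
     startNameNodeShouldFail(StartupOption.UPGRADE, IOException.class,
         Pattern.compile("failed in 1 storage"));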
   
@@ -155,6 +185,11 @@ public class TestDFSUpgrade {
                                            .build();
   }
   
+  @BeforeClass
+  public static void initialize() throws Exception {
+    UpgradeUtilities.initialize();
+  }
+  
   /**
    * This test attempts to upgrade the NameNode and DataNode under
    * a number of valid and invalid conditions.
@@ -162,8 +197,6 @@ public class TestDFSUpgrade {
   @Test
   public void testUpgrade() throws Exception {
     File[] baseDirs;
-    UpgradeUtilities.initialize();
-    
     StorageInfo storageInfo = null;
     for (int numDirs = 1; numDirs <= 2; numDirs++) {
       conf = new HdfsConfiguration();
@@ -311,6 +344,30 @@ public class TestDFSUpgrade {
       UpgradeUtilities.createEmptyDirs(nameNodeDirs);
     }
   }
+  
+  /*
+   * Stand-alone test to detect failure of one SD during parallel upgrade.
+   * At this time, can only be done with manual hack of {@link FSImage.doUpgrade()}
+   */
+  @Ignore
+  public void testUpgrade4() throws Exception {
+    int numDirs = 4;
+    conf = new HdfsConfiguration();
+    conf.setInt(DFSConfigKeys.DFS_DATANODE_SCAN_PERIOD_HOURS_KEY, -1);      
+    conf = UpgradeUtilities.initializeStorageStateConf(numDirs, conf);
+    String[] nameNodeDirs = conf.getStrings(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY);
+
+    log("NameNode upgrade with one bad storage dir", numDirs);
+    UpgradeUtilities.createNameNodeStorageDirs(nameNodeDirs, "current");
+    try {
+      // assert("storage dir has been prepared for failure before reaching this point");
+      startNameNodeShouldFail(StartupOption.UPGRADE, IOException.class,
+          Pattern.compile("failed in 1 storage"));
+    } finally {
+      // assert("storage dir shall be returned to normal state before exiting");
+      UpgradeUtilities.createEmptyDirs(nameNodeDirs);
+    }
+  }
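
 As the comment above notes, the one-bad-directory condition cannot yet be produced programmatically. Purely as a hypothetical sketch of the manual hack, one could temporarily throw inside FSImage.doUpgrade()'s per-directory loop for a chosen directory:

     // Hypothetical fault injection, hand-edited into FSImage.doUpgrade();
     // "name1" is an arbitrary storage directory chosen to fail.
     if (sd.getRoot().getPath().contains("name1")) {
       throw new IOException("Injected upgrade failure for testing");
     }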
  
   @Test(expected=IOException.class)
   public void testUpgradeFromPreUpgradeLVFails() throws IOException {
@@ -320,6 +377,7 @@ public class TestDFSUpgrade {
     fail("Expected IOException is not thrown");
   }
   
+  @Ignore
   public void test203LayoutVersion() {
     for (int lv : Storage.LAYOUT_VERSIONS_203) {
       assertTrue(Storage.is203LayoutVersion(lv));
@@ -327,7 +385,9 @@ public class TestDFSUpgrade {
   }
   
   public static void main(String[] args) throws Exception {
-    new TestDFSUpgrade().testUpgrade();
+    TestDFSUpgrade t = new TestDFSUpgrade();
+    TestDFSUpgrade.initialize();
+    t.testUpgrade();
   }
 }