You are viewing a plain text version of this content. The canonical link for it is here.
Posted to hdfs-commits@hadoop.apache.org by to...@apache.org on 2011/06/24 23:46:03 UTC
svn commit: r1139450 - in /hadoop/common/branches/HDFS-1073/hdfs: ./
src/java/org/apache/hadoop/hdfs/server/namenode/
src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/
src/test/hdfs/org/apache/hadoop/test/
Author: todd
Date: Fri Jun 24 21:46:03 2011
New Revision: 1139450
URL: http://svn.apache.org/viewvc?rev=1139450&view=rev
Log:
HDFS-2077. Address checkpoint upload when one of the storage dirs is failed. Contributed by Todd Lipcon.
Modified:
hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt
hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java
hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java
hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java
hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java
hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestTransferFsImage.java
hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java
Modified: hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt?rev=1139450&r1=1139449&r2=1139450&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/CHANGES.HDFS-1073.txt Fri Jun 24 21:46:03 2011
@@ -55,3 +55,5 @@ HDFS-2074. Determine edit log validity b
HDFS-2085. Finalize in-progress edit logs at startup. (todd)
HDFS-2026. SecondaryNameNode should properly handle the case where the
NameNode is reformatted. (todd)
+HDFS-2077. Address checkpoint upload when one of the storage dirs is failed
+ (todd)
Modified: hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java?rev=1139450&r1=1139449&r2=1139450&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java Fri Jun 24 21:46:03 2011
@@ -34,6 +34,7 @@ import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.EditLogValidation;
@@ -74,8 +75,16 @@ class FSImageTransactionalStorageInspect
}
File currentDir = sd.getCurrentDir();
+ File filesInStorage[];
+ try {
+ filesInStorage = FileUtil.listFiles(currentDir);
+ } catch (IOException ioe) {
+ LOG.warn("Unable to inspect storage directory " + currentDir,
+ ioe);
+ return;
+ }
- for (File f : currentDir.listFiles()) {
+ for (File f : filesInStorage) {
LOG.debug("Checking file " + f);
String name = f.getName();
Modified: hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java?rev=1139450&r1=1139449&r2=1139450&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java Fri Jun 24 21:46:03 2011
@@ -832,7 +832,7 @@ public class NNStorage extends Storage i
* @param sds A list of storage directories to mark as errored.
* @throws IOException
*/
- void reportErrorsOnDirectories(List<StorageDirectory> sds) throws IOException {
+ void reportErrorsOnDirectories(List<StorageDirectory> sds) {
for (StorageDirectory sd : sds) {
reportErrorsOnDirectory(sd);
}
@@ -846,8 +846,7 @@ public class NNStorage extends Storage i
* @param sd A storage directory to mark as errored.
* @throws IOException
*/
- void reportErrorsOnDirectory(StorageDirectory sd)
- throws IOException {
+ void reportErrorsOnDirectory(StorageDirectory sd) {
LOG.warn("Error reported on storage directory " + sd);
String lsd = listStorageDirectories();
@@ -905,6 +904,29 @@ public class NNStorage extends Storage i
}
/**
+ * Report that an IOE has occurred on some file which may
+ * or may not be within one of the NN image storage directories.
+ */
+ void reportErrorOnFile(File f) {
+ // We use getAbsolutePath here instead of getCanonicalPath since we know
+ // that there is some IO problem on that drive.
+ // getCanonicalPath may need to call stat() or readlink() and it's likely
+ // those calls would fail due to the same underlying IO problem.
+ String absPath = f.getAbsolutePath();
+ for (StorageDirectory sd : storageDirs) {
+ String dirPath = sd.getRoot().getAbsolutePath();
+ if (!dirPath.endsWith("/")) {
+ dirPath += "/";
+ }
+ if (absPath.startsWith(dirPath)) {
+ reportErrorsOnDirectory(sd);
+ return;
+ }
+ }
+
+ }
+
+ /**
* Generate new clusterID.
*
* clusterID is a persistent attribute of the cluster.
Modified: hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java?rev=1139450&r1=1139449&r2=1139450&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java Fri Jun 24 21:46:03 2011
@@ -69,7 +69,7 @@ class TransferFsImage implements FSConst
throw new IOException("No targets in destination storage!");
}
- MD5Hash hash = getFileClient(fsName, fileid, dstFiles, needDigest);
+ MD5Hash hash = getFileClient(fsName, fileid, dstFiles, dstStorage, needDigest);
LOG.info("Downloaded file " + dstFiles.get(0).getName() + " size " +
dstFiles.get(0).length() + " bytes.");
return hash;
@@ -85,7 +85,7 @@ class TransferFsImage implements FSConst
List<File> dstFiles = dstStorage.getFiles(NameNodeDirType.EDITS, fileName);
assert !dstFiles.isEmpty() : "No checkpoint targets.";
- getFileClient(fsName, fileid, dstFiles, false);
+ getFileClient(fsName, fileid, dstFiles, dstStorage, false);
LOG.info("Downloaded file " + dstFiles.get(0).getName() + " size " +
dstFiles.get(0).length() + " bytes.");
}
@@ -107,7 +107,7 @@ class TransferFsImage implements FSConst
txid, imageListenAddress, storage);
// this doesn't directly upload an image, but rather asks the NN
// to connect back to the 2NN to download the specified image.
- TransferFsImage.getFileClient(fsName, fileid, null, false);
+ TransferFsImage.getFileClient(fsName, fileid, null, null, false);
LOG.info("Uploaded image with txid " + txid + " to namenode at " +
fsName);
}
@@ -168,12 +168,13 @@ class TransferFsImage implements FSConst
/**
* Client-side Method to fetch file from a server
* Copies the response from the URL to a list of local files.
- *
+ * @param dstStorage if an error occurs writing to one of the files,
+ * this storage object will be notified.
* @Return a digest of the received file if getChecksum is true
*/
static MD5Hash getFileClient(String nnHostPort,
String queryString, List<File> localPaths,
- boolean getChecksum) throws IOException {
+ NNStorage dstStorage, boolean getChecksum) throws IOException {
byte[] buf = new byte[BUFFER_SIZE];
String proto = UserGroupInformation.isSecurityEnabled() ? "https://" : "http://";
StringBuilder str = new StringBuilder(proto+nnHostPort+"/getimage?");
@@ -215,20 +216,29 @@ class TransferFsImage implements FSConst
}
boolean finishedReceiving = false;
- if (localPaths == null) {
- localPaths = Collections.emptyList();
- }
-
List<FileOutputStream> outputStreams = Lists.newArrayList();
try {
- for (File f : localPaths) {
- if (f.exists()) {
- LOG.warn("Overwriting existing file " + f
- + " with file downloaded from " + str);
+ if (localPaths != null) {
+ for (File f : localPaths) {
+ try {
+ if (f.exists()) {
+ LOG.warn("Overwriting existing file " + f
+ + " with file downloaded from " + str);
+ }
+ outputStreams.add(new FileOutputStream(f));
+ } catch (IOException ioe) {
+ LOG.warn("Unable to download file " + f, ioe);
+ dstStorage.reportErrorOnFile(f);
+ }
+ }
+
+ if (outputStreams.isEmpty()) {
+ throw new IOException(
+ "Unable to download to any storage directory");
}
- outputStreams.add(new FileOutputStream(f));
}
+
int num = 1;
while (num > 0) {
num = stream.read(buf);
Modified: hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java?rev=1139450&r1=1139449&r2=1139450&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java Fri Jun 24 21:46:03 2011
@@ -1380,6 +1380,66 @@ public class TestCheckpoint extends Test
}
}
+ /**
+ * Test that, if a storage directory is failed when a checkpoint occurs,
+ * the non-failed storage directory receives the checkpoint.
+ */
+ @SuppressWarnings("deprecation")
+ public void testCheckpointWithFailedStorageDir() throws Exception {
+ MiniDFSCluster cluster = null;
+ SecondaryNameNode secondary = null;
+ File currentDir = null;
+
+ Configuration conf = new HdfsConfiguration();
+
+ try {
+ cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0)
+ .format(true).build();
+
+ secondary = startSecondaryNameNode(conf);
+
+ // Checkpoint once
+ secondary.doCheckpoint();
+
+ // Now primary NN experiences failure of a volume -- fake by
+ // setting its current dir to a-x permissions
+ NameNode nn = cluster.getNameNode();
+ NNStorage storage = nn.getFSImage().getStorage();
+ StorageDirectory sd0 = storage.getStorageDir(0);
+ StorageDirectory sd1 = storage.getStorageDir(1);
+
+ currentDir = sd0.getCurrentDir();
+ currentDir.setExecutable(false);
+
+ // Upload checkpoint when NN has a bad storage dir. This should
+ // succeed and create the checkpoint in the good dir.
+ secondary.doCheckpoint();
+
+ GenericTestUtils.assertExists(
+ new File(sd1.getCurrentDir(), NNStorage.getImageFileName(2)));
+
+ // Restore the good dir
+ currentDir.setExecutable(true);
+ nn.restoreFailedStorage("true");
+ nn.rollEditLog();
+
+ // Checkpoint again -- this should upload to both dirs
+ secondary.doCheckpoint();
+
+ assertNNHasCheckpoints(cluster, ImmutableList.of(8));
+ assertParallelFilesInvariant(cluster, ImmutableList.of(secondary));
+ } finally {
+ if (currentDir != null) {
+ currentDir.setExecutable(true);
+ }
+ if (secondary != null) {
+ secondary.shutdown();
+ }
+ if (cluster != null) {
+ cluster.shutdown();
+ }
+ }
+ }
@SuppressWarnings("deprecation")
private void cleanup(SecondaryNameNode snn) {
Modified: hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestTransferFsImage.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestTransferFsImage.java?rev=1139450&r1=1139449&r2=1139450&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestTransferFsImage.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestTransferFsImage.java Fri Jun 24 21:46:03 2011
@@ -30,34 +30,72 @@ import org.apache.hadoop.hdfs.HdfsConfig
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.util.StringUtils;
import org.junit.Test;
+import org.mockito.Mockito;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
public class TestTransferFsImage {
+ private static final File TEST_DIR = new File(
+ System.getProperty("test.build.data","build/test/data"));
+
/**
* Regression test for HDFS-1997. Test that, if an exception
- * occurs on the client side, it is properly reported as such
+ * occurs on the client side, it is properly reported as such,
+ * and reported to the associated NNStorage object.
*/
@Test
public void testClientSideException() throws IOException {
-
Configuration conf = new HdfsConfiguration();
MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
.numDataNodes(0).build();
+ NNStorage mockStorage = Mockito.mock(NNStorage.class);
+ List<File> localPath = Collections.<File>singletonList(
+ new File("/xxxxx-does-not-exist/blah"));
+
try {
-
String fsName = NameNode.getHostPortString(
cluster.getNameNode().getHttpAddress());
String id = "getimage=1&txid=0";
- List<File> localPath = Collections.<File>singletonList(
- new File("/xxxxx-does-not-exist/blah"));
-
- TransferFsImage.getFileClient(fsName, id, localPath, false);
+ TransferFsImage.getFileClient(fsName, id, localPath, mockStorage, false);
fail("Didn't get an exception!");
} catch (IOException ioe) {
- assertTrue("Expected FNFE, got: " + StringUtils.stringifyException(ioe),
- ioe instanceof FileNotFoundException);
+ Mockito.verify(mockStorage).reportErrorOnFile(localPath.get(0));
+ assertTrue(
+ "Unexpected exception: " + StringUtils.stringifyException(ioe),
+ ioe.getMessage().contains("Unable to download to any storage"));
+ } finally {
+ cluster.shutdown();
+ }
+ }
+
+ /**
+ * Similar to the above test, except that there are multiple local files
+ * and one of them can be saved.
+ */
+ @Test
+ public void testClientSideExceptionOnJustOneDir() throws IOException {
+ Configuration conf = new HdfsConfiguration();
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .numDataNodes(0).build();
+ NNStorage mockStorage = Mockito.mock(NNStorage.class);
+ List<File> localPaths = ImmutableList.of(
+ new File("/xxxxx-does-not-exist/blah"),
+ new File(TEST_DIR, "testfile")
+ );
+
+ try {
+ String fsName = NameNode.getHostPortString(
+ cluster.getNameNode().getHttpAddress());
+ String id = "getimage=1&txid=0";
+
+ TransferFsImage.getFileClient(fsName, id, localPaths, mockStorage, false);
+ Mockito.verify(mockStorage).reportErrorOnFile(localPaths.get(0));
+ assertTrue("The valid local file should get saved properly",
+ localPaths.get(1).length() > 0);
} finally {
cluster.shutdown();
}
Modified: hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java?rev=1139450&r1=1139449&r2=1139450&view=diff
==============================================================================
--- hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java (original)
+++ hadoop/common/branches/HDFS-1073/hdfs/src/test/hdfs/org/apache/hadoop/test/GenericTestUtils.java Fri Jun 24 21:46:03 2011
@@ -17,11 +17,13 @@
*/
package org.apache.hadoop.test;
+import java.io.File;
import java.io.IOException;
import java.util.concurrent.CountDownLatch;
import org.apache.commons.logging.Log;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
+import org.junit.Assert;
import org.mockito.Mockito;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;
@@ -40,6 +42,13 @@ public abstract class GenericTestUtils {
}
/**
+ * Assert that a given file exists.
+ */
+ public static void assertExists(File f) {
+ Assert.assertTrue("File " + f + " should exist", f.exists());
+ }
+
+ /**
* Mockito answer helper that triggers one latch as soon as the
* method is called, then waits on another before continuing.
*/