You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by su...@apache.org on 2012/03/15 00:54:11 UTC
svn commit: r1300791 - in /hadoop/common/branches/branch-1.0: ./
src/hdfs/org/apache/hadoop/hdfs/server/namenode/
src/test/org/apache/hadoop/hdfs/server/namenode/
Author: suresh
Date: Wed Mar 14 23:54:10 2012
New Revision: 1300791
URL: http://svn.apache.org/viewvc?rev=1300791&view=rev
Log:
HDFS-3075. Merging r1300680 and other dependencies HDFS-2701:r1221098, HDFS-2703:r1221099, HDFS-2702:r1221100
Added:
hadoop/common/branches/branch-1.0/src/test/org/apache/hadoop/hdfs/server/namenode/TestStorageDirectoryFailure.java
- copied unchanged from r1221100, hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestStorageDirectoryFailure.java
hadoop/common/branches/branch-1.0/src/test/org/apache/hadoop/hdfs/server/namenode/TestStorageRestore.java
- copied unchanged from r1300680, hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestStorageRestore.java
Modified:
hadoop/common/branches/branch-1.0/CHANGES.txt
hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java
hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java
hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java
hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java
Modified: hadoop/common/branches/branch-1.0/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1.0/CHANGES.txt?rev=1300791&r1=1300790&r2=1300791&view=diff
==============================================================================
--- hadoop/common/branches/branch-1.0/CHANGES.txt (original)
+++ hadoop/common/branches/branch-1.0/CHANGES.txt Wed Mar 14 23:54:10 2012
@@ -1,5 +1,8 @@
Hadoop Change Log
+ HDFS-3075. Backport HADOOP-4885: Try to restore failed name-node storage
+ directories at checkpoint time. (Brandon Li via szetszwo)
+
Release 1.0.2 - unreleased
NEW FEATURES
@@ -7,6 +10,8 @@ Release 1.0.2 - unreleased
HADOOP-7206. Support Snappy compression. (Issei Yoshida and
Alejandro Abdelnur via vinodkv).
+ HDFS-2701. Cleanup FS* processIOError methods. (eli)
+
HDFS-2978. The NameNode should expose name dir statuses via JMX. (atm)
IMPROVEMENTS
@@ -36,6 +41,11 @@ Release 1.0.2 - unreleased
MAPREDUCE-764. Fix TypedBytesInput.readRaw to preserve custom type codes.
(Klaas Bosteels via acmurthy)
+ HDFS-2703. removedStorageDirs is not updated everywhere we remove
+ a storage dir. (eli)
+
+ HDFS-2702. A single failed name dir can cause the NN to exit. (eli)
+
Release 1.0.1 - 2012.02.14
NEW FEATURES
Modified: hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java?rev=1300791&r1=1300790&r2=1300791&view=diff
==============================================================================
--- hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java (original)
+++ hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java Wed Mar 14 23:54:10 2012
@@ -69,6 +69,9 @@ class FSDirectory implements FSConstants
ns.createFsOwnerPermissions(new FsPermission((short)0755)),
Integer.MAX_VALUE, -1);
this.fsImage = fsImage;
+ fsImage.setRestoreRemovedDirs(conf.getBoolean(
+ DFSConfigKeys.DFS_NAMENODE_NAME_DIR_RESTORE_KEY,
+ DFSConfigKeys.DFS_NAMENODE_NAME_DIR_RESTORE_DEFAULT));
namesystem = ns;
int configuredLimit = conf.getInt(
DFSConfigKeys.DFS_LIST_LIMIT, DFSConfigKeys.DFS_LIST_LIMIT_DEFAULT);
Modified: hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java?rev=1300791&r1=1300790&r2=1300791&view=diff
==============================================================================
--- hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java (original)
+++ hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java Wed Mar 14 23:54:10 2012
@@ -304,11 +304,13 @@ public class FSEditLog {
}
private int getNumStorageDirs() {
- int numStorageDirs = 0;
- for (Iterator<StorageDirectory> it =
- fsimage.dirIterator(NameNodeDirType.EDITS); it.hasNext(); it.next())
- numStorageDirs++;
- return numStorageDirs;
+ int numStorageDirs = 0;
+ Iterator<StorageDirectory> it = fsimage.dirIterator(NameNodeDirType.EDITS);
+ while (it.hasNext()) {
+ numStorageDirs++;
+ it.next();
+ }
+ return numStorageDirs;
}
synchronized int getNumEditStreams() {
@@ -327,21 +329,22 @@ public class FSEditLog {
*/
public synchronized void open() throws IOException {
numTransactions = totalTimeTransactions = numTransactionsBatchedInSync = 0;
- if (editStreams == null)
+ if (editStreams == null) {
editStreams = new ArrayList<EditLogOutputStream>();
- for (Iterator<StorageDirectory> it =
- fsimage.dirIterator(NameNodeDirType.EDITS); it.hasNext();) {
+ }
+ Iterator<StorageDirectory> it = fsimage.dirIterator(NameNodeDirType.EDITS);
+ while (it.hasNext()) {
StorageDirectory sd = it.next();
File eFile = getEditFile(sd);
try {
EditLogOutputStream eStream = new EditLogFileOutputStream(eFile);
editStreams.add(eStream);
- } catch (IOException e) {
- FSNamesystem.LOG.warn("Unable to open edit log file " + eFile);
- // Remove the directory from list of storage directories
+ } catch (IOException ioe) {
+ fsimage.updateRemovedDirs(sd, ioe);
it.remove();
}
}
+ exitIfNoStreams();
}
public synchronized void createEditLogFile(File name) throws IOException {
@@ -372,84 +375,94 @@ public class FSEditLog {
eStream.setReadyToFlush();
eStream.flush();
eStream.close();
- } catch (IOException e) {
- processIOError(idx);
+ } catch (IOException ioe) {
+ removeEditsAndStorageDir(idx);
idx--;
}
}
editStreams.clear();
}
+ void fatalExit(String msg) {
+ FSNamesystem.LOG.fatal(msg, new Exception(msg));
+ Runtime.getRuntime().exit(-1);
+ }
+
/**
- * If there is an IO Error on any log operations, remove that
- * directory from the list of directories.
- * If no more directories remain, then exit.
- */
- synchronized void processIOError(int index) {
- if (editStreams == null || editStreams.size() <= 1) {
- FSNamesystem.LOG.fatal(
- "Fatal Error : All storage directories are inaccessible.");
- Runtime.getRuntime().exit(-1);
+ * Exit the NN process if the edit streams have not yet been
+ * initialized, e.g. because we failed while opening them.
+ */
+ private void exitIfStreamsNotSet() {
+ if (editStreams == null) {
+ fatalExit("Edit streams not yet initialized");
}
- assert(index < getNumStorageDirs());
- assert(getNumStorageDirs() == editStreams.size());
+ }
+
+ /**
+ * Exit the NN process if there are no edit streams to log to.
+ */
+ void exitIfNoStreams() {
+ if (editStreams == null || editStreams.isEmpty()) {
+ fatalExit("No edit streams are accessible");
+ }
+ }
+
+ /**
+ * @return the storage directory for the given edit stream.
+ */
+ private File getStorageDirForStream(int idx) {
+ File editsFile =
+ ((EditLogFileOutputStream)editStreams.get(idx)).getFile();
+ // Namedir is the parent of current which is the parent of edits
+ return editsFile.getParentFile().getParentFile();
+ }
+
+ /**
+ * Remove the given edits stream and its containing storage dir.
+ */
+ synchronized void removeEditsAndStorageDir(int idx) {
+ exitIfStreamsNotSet();
+
+ assert idx < getNumStorageDirs();
+ assert getNumStorageDirs() == editStreams.size();
- File parentStorageDir = ((EditLogFileOutputStream)editStreams
- .get(index)).getFile()
- .getParentFile().getParentFile();
- editStreams.remove(index);
- //
- // Invoke the ioerror routine of the fsimage
- //
- fsimage.processIOError(parentStorageDir);
+ File dir = getStorageDirForStream(idx);
+ editStreams.remove(idx);
+ fsimage.removeStorageDir(dir);
}
-
+
/**
- * If there is an IO Error on any log operations on storage directory,
- * remove any stream associated with that directory
+ * Remove all edits streams for the given storage directory.
*/
- synchronized void processIOError(StorageDirectory sd) {
- // Try to remove stream only if one should exist
- if (!sd.getStorageDirType().isOfType(NameNodeDirType.EDITS))
+ synchronized void removeEditsForStorageDir(StorageDirectory sd) {
+ exitIfStreamsNotSet();
+
+ if (!sd.getStorageDirType().isOfType(NameNodeDirType.EDITS)) {
return;
- if (editStreams == null || editStreams.size() <= 1) {
- FSNamesystem.LOG.fatal(
- "Fatal Error : All storage directories are inaccessible.");
- Runtime.getRuntime().exit(-1);
}
for (int idx = 0; idx < editStreams.size(); idx++) {
- File parentStorageDir = ((EditLogFileOutputStream)editStreams
- .get(idx)).getFile()
- .getParentFile().getParentFile();
- if (parentStorageDir.getName().equals(sd.getRoot().getName()))
+ File parentDir = getStorageDirForStream(idx);
+ if (parentDir.getName().equals(sd.getRoot().getName())) {
editStreams.remove(idx);
- }
+ }
+ }
}
/**
- * The specified streams have IO errors. Remove them from logging
- * new transactions.
+ * Remove each of the given edits streams and their corresponding
+ * storage directories.
*/
- private void processIOError(ArrayList<EditLogOutputStream> errorStreams) {
+ private void removeEditsStreamsAndStorageDirs(
+ ArrayList<EditLogOutputStream> errorStreams) {
if (errorStreams == null) {
- return; // nothing to do
+ return;
}
- for (int idx = 0; idx < errorStreams.size(); idx++) {
- EditLogOutputStream eStream = errorStreams.get(idx);
- int j = 0;
- int numEditStreams = editStreams.size();
- for (j = 0; j < numEditStreams; j++) {
- if (editStreams.get(j) == eStream) {
- break;
- }
- }
- if (j == numEditStreams) {
- FSNamesystem.LOG.error("Unable to find sync log on which " +
- " IO error occured. " +
- "Fatal Error.");
- Runtime.getRuntime().exit(-1);
+ for (EditLogOutputStream errorStream : errorStreams) {
+ int idx = editStreams.indexOf(errorStream);
+ if (-1 == idx) {
+ fatalExit("Unable to find edits stream with IO error");
}
- processIOError(j);
+ removeEditsAndStorageDir(idx);
}
fsimage.incrementCheckpointTime();
}
@@ -458,8 +471,8 @@ public class FSEditLog {
* check if ANY edits.new log exists
*/
boolean existsNew() throws IOException {
- for (Iterator<StorageDirectory> it =
- fsimage.dirIterator(NameNodeDirType.EDITS); it.hasNext();) {
+ Iterator<StorageDirectory> it = fsimage.dirIterator(NameNodeDirType.EDITS);
+ while (it.hasNext()) {
if (getEditNewFile(it.next()).exists()) {
return true;
}
@@ -903,19 +916,20 @@ public class FSEditLog {
* store yet.
*/
synchronized void logEdit(byte op, Writable ... writables) {
- assert this.getNumEditStreams() > 0 : "no editlog streams";
+ if (getNumEditStreams() < 1) {
+ throw new AssertionError("No edit streams to log to");
+ }
long start = FSNamesystem.now();
for (int idx = 0; idx < editStreams.size(); idx++) {
EditLogOutputStream eStream = editStreams.get(idx);
try {
eStream.write(op, writables);
- } catch (IOException ie) {
- processIOError(idx);
- // processIOError will remove the idx's stream
- // from the editStreams collection, so we need to update idx
+ } catch (IOException ioe) {
+ removeEditsAndStorageDir(idx);
idx--;
}
}
+ exitIfNoStreams();
// get a new transactionId
txid++;
@@ -983,7 +997,7 @@ public class FSEditLog {
EditLogOutputStream eStream = editStreams.get(idx);
try {
eStream.flush();
- } catch (IOException ie) {
+ } catch (IOException ioe) {
//
// remember the streams that encountered an error.
//
@@ -991,14 +1005,14 @@ public class FSEditLog {
errorStreams = new ArrayList<EditLogOutputStream>(1);
}
errorStreams.add(eStream);
- FSNamesystem.LOG.error("Unable to sync edit log. " +
- "Fatal Error.");
+ FSNamesystem.LOG.error("Unable to sync "+eStream.getName());
}
}
long elapsed = FSNamesystem.now() - start;
synchronized (this) {
- processIOError(errorStreams);
+ removeEditsStreamsAndStorageDirs(errorStreams);
+ exitIfNoStreams();
synctxid = syncStart;
isSyncRunning = false;
this.notifyAll();
@@ -1217,36 +1231,46 @@ public class FSEditLog {
// exists in all directories.
//
if (existsNew()) {
- for (Iterator<StorageDirectory> it =
- fsimage.dirIterator(NameNodeDirType.EDITS); it.hasNext();) {
+ Iterator<StorageDirectory> it =
+ fsimage.dirIterator(NameNodeDirType.EDITS);
+ StringBuilder b = new StringBuilder();
+ while (it.hasNext()) {
File editsNew = getEditNewFile(it.next());
- if (!editsNew.exists()) {
- throw new IOException("Inconsistent existance of edits.new " +
- editsNew);
+ b.append("\n ").append(editsNew);
+ if (!editsNew.exists()) {
+ throw new IOException(
+ "Inconsistent existence of edits.new " + editsNew);
}
}
- return; // nothing to do, edits.new exists!
+ FSNamesystem.LOG.warn("Cannot roll edit log," +
+ " edits.new files already exists in all healthy directories:" + b);
+ return;
}
- close(); // close existing edit log
+ close(); // close existing edit log
+ // After edit streams are closed, healthy edits files should be identical,
+ // and the same holds for the fsimage files
+ fsimage.restoreStorageDirs();
+
//
// Open edits.new
//
- for (Iterator<StorageDirectory> it =
- fsimage.dirIterator(NameNodeDirType.EDITS); it.hasNext();) {
+ Iterator<StorageDirectory> it = fsimage.dirIterator(NameNodeDirType.EDITS);
+ while (it.hasNext()) {
StorageDirectory sd = it.next();
try {
EditLogFileOutputStream eStream =
new EditLogFileOutputStream(getEditNewFile(sd));
eStream.create();
editStreams.add(eStream);
- } catch (IOException e) {
- // remove stream and this storage directory from list
- processIOError(sd);
- it.remove();
+ } catch (IOException ioe) {
+ removeEditsForStorageDir(sd);
+ fsimage.updateRemovedDirs(sd, ioe);
+ it.remove();
}
}
+ exitIfNoStreams();
}
/**
@@ -1266,8 +1290,8 @@ public class FSEditLog {
//
// Delete edits and rename edits.new to edits.
//
- for (Iterator<StorageDirectory> it =
- fsimage.dirIterator(NameNodeDirType.EDITS); it.hasNext();) {
+ Iterator<StorageDirectory> it = fsimage.dirIterator(NameNodeDirType.EDITS);
+ while (it.hasNext()) {
StorageDirectory sd = it.next();
if (!getEditNewFile(sd).renameTo(getEditFile(sd))) {
//
@@ -1276,8 +1300,10 @@ public class FSEditLog {
//
getEditFile(sd).delete();
if (!getEditNewFile(sd).renameTo(getEditFile(sd))) {
- // Should we also remove from edits
- it.remove();
+ sd.unlock();
+ removeEditsForStorageDir(sd);
+ fsimage.updateRemovedDirs(sd, null);
+ it.remove();
}
}
}
Modified: hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java?rev=1300791&r1=1300790&r2=1300791&view=diff
==============================================================================
--- hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java (original)
+++ hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java Wed Mar 14 23:54:10 2012
@@ -28,39 +28,41 @@ import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
+import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
+import java.util.Map;
import java.util.Properties;
import java.util.Random;
-import java.util.Map;
-import java.util.HashMap;
-import java.lang.Math;
-import java.nio.ByteBuffer;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.permission.PermissionStatus;
import org.apache.hadoop.fs.permission.FsPermission;
-import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.permission.PermissionStatus;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.NodeType;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
-import org.apache.hadoop.io.UTF8;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.hdfs.server.namenode.NameNode;
-import org.apache.hadoop.hdfs.server.namenode.BlocksMap.BlockInfo;
-import org.apache.hadoop.hdfs.server.namenode.FSEditLog.EditLogFileInputStream;
import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.common.StorageInfo;
import org.apache.hadoop.hdfs.server.common.UpgradeManager;
+import org.apache.hadoop.hdfs.server.namenode.BlocksMap.BlockInfo;
+import org.apache.hadoop.hdfs.server.namenode.FSEditLog.EditLogFileInputStream;
import org.apache.hadoop.hdfs.util.AtomicFileOutputStream;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.MultipleIOException;
+import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Writable;
/**
* FSImage handles checkpointing and logging of the namespace edits.
@@ -116,9 +118,10 @@ public class FSImage extends Storage {
private boolean isUpgradeFinalized = false;
/**
- * list of failed (and thus removed) storages
+ * List of failed (and thus removed) storages
*/
- protected List<StorageDirectory> removedStorageDirs = new ArrayList<StorageDirectory>();
+ private List<StorageDirectory> removedStorageDirs
+ = new ArrayList<StorageDirectory>();
/**
* Directories for importing an image from a checkpoint.
@@ -137,6 +140,9 @@ public class FSImage extends Storage {
static private final FsPermission FILE_PERM = new FsPermission((short)0);
static private final byte[] PATH_SEPARATOR = DFSUtil.string2Bytes(Path.SEPARATOR);
+ /** Flag to restore removed storage directories at checkpointing */
+ private boolean restoreRemovedDirs = DFSConfigKeys.DFS_NAMENODE_NAME_DIR_RESTORE_DEFAULT;
+
/**
*/
FSImage() {
@@ -171,9 +177,9 @@ public class FSImage extends Storage {
void setStorageDirectories(Collection<File> fsNameDirs,
Collection<File> fsEditsDirs
) throws IOException {
- this.storageDirs = new ArrayList<StorageDirectory>();
- this.removedStorageDirs = new ArrayList<StorageDirectory>();
- // Add all name dirs with appropriate NameNodeDirType
+ storageDirs = new ArrayList<StorageDirectory>();
+ removedStorageDirs = new ArrayList<StorageDirectory>();
+ // Add all name dirs with appropriate NameNodeDirType
for (File dirName : fsNameDirs) {
boolean isAlsoEdits = false;
for (File editsDirName : fsEditsDirs) {
@@ -186,13 +192,12 @@ public class FSImage extends Storage {
NameNodeDirType dirType = (isAlsoEdits) ?
NameNodeDirType.IMAGE_AND_EDITS :
NameNodeDirType.IMAGE;
- this.addStorageDir(new StorageDirectory(dirName, dirType));
+ addStorageDir(new StorageDirectory(dirName, dirType));
}
// Add edits dirs if they are different from name dirs
for (File dirName : fsEditsDirs) {
- this.addStorageDir(new StorageDirectory(dirName,
- NameNodeDirType.EDITS));
+ addStorageDir(new StorageDirectory(dirName, NameNodeDirType.EDITS));
}
}
@@ -207,9 +212,14 @@ public class FSImage extends Storage {
}
List<StorageDirectory> getRemovedStorageDirs() {
- return this.removedStorageDirs;
+ return removedStorageDirs;
}
-
+
+ void updateRemovedDirs(StorageDirectory sd, IOException ioe) {
+ LOG.warn("Removing storage dir " + sd.getRoot().getPath(), ioe);
+ removedStorageDirs.add(sd);
+ }
+
File getEditFile(StorageDirectory sd) {
return getImageFile(sd, NameNodeFile.EDITS);
}
@@ -604,45 +614,37 @@ public class FSImage extends Storage {
}
/**
- * Record new checkpoint time in order to
+ * Record new checkpoint time in each storage dir in order to
* distinguish healthy directories from the removed ones.
* If there is an error writing new checkpoint time, the corresponding
* storage directory is removed from the list.
*/
void incrementCheckpointTime() {
this.checkpointTime++;
-
- // Write new checkpoint time in all storage directories
- for(Iterator<StorageDirectory> it =
- dirIterator(); it.hasNext();) {
+
+ Iterator<StorageDirectory> it = dirIterator();
+ while (it.hasNext()) {
StorageDirectory sd = it.next();
try {
writeCheckpointTime(sd);
- } catch(IOException e) {
- // Close any edits stream associated with this dir and remove directory
- if (sd.getStorageDirType().isOfType(NameNodeDirType.EDITS)) {
- editLog.processIOError(sd);
- }
-
- //add storage to the removed list
- removedStorageDirs.add(sd);
+ } catch (IOException ioe) {
+ editLog.removeEditsForStorageDir(sd);
+ updateRemovedDirs(sd, ioe);
it.remove();
}
}
+ editLog.exitIfNoStreams();
}
/**
- * Remove storage directory given directory
+ * Remove the given storage directory.
*/
-
- void processIOError(File dirName) {
- for (Iterator<StorageDirectory> it =
- dirIterator(); it.hasNext();) {
+ void removeStorageDir(File dir) {
+ Iterator<StorageDirectory> it = dirIterator();
+ while (it.hasNext()) {
StorageDirectory sd = it.next();
- if (sd.getRoot().getPath().equals(dirName.getPath())) {
- //add storage to the removed list
- LOG.info(" removing " + dirName.getPath());
- removedStorageDirs.add(sd);
+ if (sd.getRoot().getPath().equals(dir.getPath())) {
+ updateRemovedDirs(sd, null);
it.remove();
}
}
@@ -652,6 +654,11 @@ public class FSImage extends Storage {
return editLog;
}
+ /** Testing hook */
+ public void setEditLog(FSEditLog newLog) {
+ editLog = newLog;
+ }
+
public boolean isConversionNeeded(StorageDirectory sd) throws IOException {
File oldImageDir = new File(sd.getRoot(), "image");
if (!oldImageDir.exists()) {
@@ -1080,7 +1087,7 @@ public class FSImage extends Storage {
moveCurrent(sd);
} catch(IOException ie) {
LOG.error("Unable to move current for " + sd.getRoot(), ie);
- processIOError(sd.getRoot());
+ removeStorageDir(sd.getRoot());
}
}
@@ -1092,7 +1099,7 @@ public class FSImage extends Storage {
saveCurrent(sd);
} catch(IOException ie) {
LOG.error("Unable to save image for " + sd.getRoot(), ie);
- processIOError(sd.getRoot());
+ removeStorageDir(sd.getRoot());
}
}
@@ -1114,7 +1121,7 @@ public class FSImage extends Storage {
saveCurrent(sd);
} catch(IOException ie) {
LOG.error("Unable to save edits for " + sd.getRoot(), ie);
- processIOError(sd.getRoot());
+ removeStorageDir(sd.getRoot());
}
}
// mv lastcheckpoint.tmp -> previous.checkpoint
@@ -1124,7 +1131,7 @@ public class FSImage extends Storage {
moveLastCheckpoint(sd);
} catch(IOException ie) {
LOG.error("Unable to move last checkpoint for " + sd.getRoot(), ie);
- processIOError(sd.getRoot());
+ removeStorageDir(sd.getRoot());
}
}
if(!editLog.isOpen()) editLog.open();
@@ -1211,7 +1218,99 @@ public class FSImage extends Storage {
newID = r.nextInt(0x7FFFFFFF); // use 31 bits only
return newID;
}
+
+ void setRestoreRemovedDirs(boolean allow) {
+ this.restoreRemovedDirs = allow;
+ }
+
+ /** restore a metadata file */
+ private static void restoreFile(File src, File dstdir, String dstfile)
+ throws IOException {
+ File dst = new File(dstdir, dstfile);
+ IOUtils.copyBytes(new FileInputStream(src), new FileOutputStream(dst),
+ DFSConfigKeys.DFS_STREAM_BUFFER_SIZE_DEFAULT, true);
+ }
+ /**
+ * Refresh storage dirs by copying files from good storage dir
+ */
+ void restoreStorageDirs() throws IOException {
+ if (!restoreRemovedDirs || getRemovedStorageDirs().isEmpty()) {
+ return;
+ }
+
+ Iterator<StorageDirectory> it = dirIterator(NameNodeDirType.EDITS);
+ if (!it.hasNext()) {
+ throw new IOException("No healthy edits directory");
+ }
+ StorageDirectory goodSd = it.next();
+ File goodEdits = getEditFile(goodSd);
+
+ it = dirIterator(NameNodeDirType.IMAGE);
+ if (!it.hasNext()) {
+ throw new IOException("No healthy fsimage directory");
+ }
+ goodSd = it.next();
+ File goodImage = getImageFile(goodSd, NameNodeFile.IMAGE);
+ File goodFstime = getImageFile(goodSd, NameNodeFile.TIME);
+ File goodVersion = goodSd.getVersionFile();
+ // image/fsimage exists so that Hadoop versions < 0.13 fail to start
+ File goodImage013 = new File(goodSd.getRoot(), "image/fsimage");
+
+ List<IOException> exceptions = new ArrayList<IOException>();
+ for (Iterator<StorageDirectory> i = removedStorageDirs.iterator();
+ i.hasNext();) {
+ StorageDirectory sd = i.next();
+ FSNamesystem.LOG.info("Try to recover removed directory " + sd.getRoot()
+ + " by reformatting");
+ try {
+ // don't create dir if it doesn't exist, since it may need to be mounted
+ if (!sd.getRoot().exists()) {
+ throw new IOException("Directory " + sd.getRoot() + "doesn't exist");
+ }
+ if (!FileUtil.fullyDeleteContents(sd.getRoot())) {
+ throw new IOException("Can't fully delete content of " + sd.getRoot());
+ }
+ sd.clearDirectory(); // create empty "current" dir
+ restoreFile(goodVersion, sd.getCurrentDir(), Storage.STORAGE_FILE_VERSION);
+ restoreFile(goodFstime, sd.getCurrentDir(), NameNodeFile.TIME.getName());
+
+ // Create image directory
+ File imageDir = new File(sd.getRoot(), "image");
+ if (!imageDir.mkdir()) {
+ throw new IOException("Can't make directory 'image'.");
+ }
+ restoreFile(goodImage013, imageDir, NameNodeFile.IMAGE.getName());
+
+ if (sd.getStorageDirType().equals(NameNodeDirType.EDITS)) {
+ restoreFile(goodEdits, sd.getCurrentDir(), NameNodeFile.EDITS.getName());
+ } else if (sd.getStorageDirType().equals(NameNodeDirType.IMAGE)) {
+ restoreFile(goodImage, sd.getCurrentDir(), NameNodeFile.IMAGE.getName());
+ } else if (sd.getStorageDirType().equals(
+ NameNodeDirType.IMAGE_AND_EDITS)) {
+ restoreFile(goodEdits, sd.getCurrentDir(), NameNodeFile.EDITS.getName());
+ restoreFile(goodImage, sd.getCurrentDir(), NameNodeFile.IMAGE.getName());
+ } else {
+ throw new IOException("Invalid NameNodeDirType: "
+ + sd.getStorageDirType());
+ }
+
+ //remove from removedStorageDirs and add back to healthy.
+ i.remove();
+ addStorageDir(new StorageDirectory(sd.getRoot(), sd.getStorageDirType()));
+ } catch (IOException e) {
+ FSNamesystem.LOG.warn("Failed to recover removed directory "
+ + sd.getRoot() + " with " + e);
+ exceptions.add(e);
+ }
+ }
+
+ if (!exceptions.isEmpty()) {
+ throw MultipleIOException.createIOException(exceptions);
+ }
+ }
+
+
/** Create new dfs name directory. Caution: this destroys all files
* in this filesystem. */
void format(StorageDirectory sd) throws IOException {
@@ -1433,8 +1532,8 @@ public class FSImage extends Storage {
if (!editLog.existsNew()) {
throw new IOException("New Edits file does not exist");
}
- for (Iterator<StorageDirectory> it =
- dirIterator(NameNodeDirType.IMAGE); it.hasNext();) {
+ Iterator<StorageDirectory> it = dirIterator(NameNodeDirType.IMAGE);
+ while (it.hasNext()) {
StorageDirectory sd = it.next();
File ckpt = getImageFile(sd, NameNodeFile.IMAGE_NEW);
if (!ckpt.exists()) {
@@ -1447,8 +1546,8 @@ public class FSImage extends Storage {
//
// Renames new image
//
- for (Iterator<StorageDirectory> it =
- dirIterator(NameNodeDirType.IMAGE); it.hasNext();) {
+ it = dirIterator(NameNodeDirType.IMAGE);
+ while (it.hasNext()) {
StorageDirectory sd = it.next();
File ckpt = getImageFile(sd, NameNodeFile.IMAGE_NEW);
File curFile = getImageFile(sd, NameNodeFile.IMAGE);
@@ -1457,15 +1556,13 @@ public class FSImage extends Storage {
if (!ckpt.renameTo(curFile)) {
curFile.delete();
if (!ckpt.renameTo(curFile)) {
- // Close edit stream, if this directory is also used for edits
- if (sd.getStorageDirType().isOfType(NameNodeDirType.EDITS))
- editLog.processIOError(sd);
- // add storage to the removed list
- removedStorageDirs.add(sd);
+ editLog.removeEditsForStorageDir(sd);
+ updateRemovedDirs(sd, null);
it.remove();
}
}
}
+ editLog.exitIfNoStreams();
//
// Updates the fstime file on all directories (fsimage and edits)
@@ -1473,8 +1570,8 @@ public class FSImage extends Storage {
//
this.layoutVersion = FSConstants.LAYOUT_VERSION;
this.checkpointTime = FSNamesystem.now();
- for (Iterator<StorageDirectory> it =
- dirIterator(); it.hasNext();) {
+ it = dirIterator();
+ while (it.hasNext()) {
StorageDirectory sd = it.next();
// delete old edits if sd is the image only the directory
if (!sd.getStorageDirType().isOfType(NameNodeDirType.EDITS)) {
@@ -1488,13 +1585,9 @@ public class FSImage extends Storage {
}
try {
sd.write();
- } catch (IOException e) {
- LOG.error("Cannot write file " + sd.getRoot(), e);
- // Close edit stream, if this directory is also used for edits
- if (sd.getStorageDirType().isOfType(NameNodeDirType.EDITS))
- editLog.processIOError(sd);
- //add storage to the removed list
- removedStorageDirs.add(sd);
+ } catch (IOException ioe) {
+ editLog.removeEditsForStorageDir(sd);
+ updateRemovedDirs(sd, ioe);
it.remove();
}
}
Modified: hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java?rev=1300791&r1=1300790&r2=1300791&view=diff
==============================================================================
--- hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java (original)
+++ hadoop/common/branches/branch-1.0/src/hdfs/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java Wed Mar 14 23:54:10 2012
@@ -426,7 +426,7 @@ public class SecondaryNameNode implement
namenode.rollFsImage();
checkpointImage.endCheckpoint();
- LOG.warn("Checkpoint done. New Image Size: "
+ LOG.info("Checkpoint done. New Image Size: "
+ checkpointImage.getFsImageName().length());
}