You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by mb...@apache.org on 2013/01/26 22:59:20 UTC
svn commit: r1438972 - in /hbase/trunk/hbase-server/src:
main/java/org/apache/hadoop/hbase/backup/HFileArchiver.java
test/java/org/apache/hadoop/hbase/backup/TestHFileArchiving.java
Author: mbertozzi
Date: Sat Jan 26 21:59:20 2013
New Revision: 1438972
URL: http://svn.apache.org/viewvc?rev=1438972&view=rev
Log:
HBASE-7643 HFileArchiver.resolveAndArchive() race condition may lead to snapshot data loss
Modified:
hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/backup/HFileArchiver.java
hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/backup/TestHFileArchiving.java
Modified: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/backup/HFileArchiver.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/backup/HFileArchiver.java?rev=1438972&r1=1438971&r2=1438972&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/backup/HFileArchiver.java (original)
+++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/backup/HFileArchiver.java Sat Jan 26 21:59:20 2013
@@ -55,6 +55,9 @@ public class HFileArchiver {
private static final Log LOG = LogFactory.getLog(HFileArchiver.class);
private static final String SEPARATOR = ".";
+ /** Number of retries in case of fs operation failure */
+ private static final int DEFAULT_RETRIES_NUMBER = 3;
+
private HFileArchiver() {
// hidden ctor since this is just a util
}
@@ -136,6 +139,7 @@ public class HFileArchiver {
try {
success = resolveAndArchive(fs, regionArchiveDir, toArchive);
} catch (IOException e) {
+ LOG.error("Failed to archive: " + toArchive, e);
success = false;
}
@@ -146,7 +150,7 @@ public class HFileArchiver {
}
throw new IOException("Received error when attempting to archive files (" + toArchive
- + "), cannot delete region directory.");
+ + "), cannot delete region directory. ");
}
/**
@@ -254,14 +258,12 @@ public class HFileArchiver {
long start = EnvironmentEdgeManager.currentTimeMillis();
List<File> failures = resolveAndArchive(fs, baseArchiveDir, toArchive, start);
- // clean out the failures by just deleting them
+ // notify that some files were not archived.
+ // We can't delete the files otherwise snapshots or other backup system
+ // that relies on the archiver end up with data loss.
if (failures.size() > 0) {
- try {
- LOG.error("Failed to complete archive, deleting extra store files.");
- deleteFilesWithoutArchiving(failures);
- } catch (IOException e) {
- LOG.warn("Failed to delete store file(s) when archiving failed", e);
- }
+ LOG.warn("Failed to complete archive of: " + failures +
+ ". Those files are still in the original location, and they may slow down reads.");
return false;
}
return true;
@@ -364,50 +366,51 @@ public class HFileArchiver {
if (!fs.delete(archiveFile, false)) {
throw new IOException("Couldn't delete existing archive file (" + archiveFile
+ ") or rename it to the backup file (" + backedupArchiveFile
- + ")to make room for similarly named file.");
+ + ") to make room for similarly named file.");
}
}
LOG.debug("Backed up archive file from: " + archiveFile);
}
- LOG.debug("No existing file in archive for:" + archiveFile + ", free to archive original file.");
+ LOG.debug("No existing file in archive for:" + archiveFile +
+ ", free to archive original file.");
// at this point, we should have a free spot for the archive file
- if (currentFile.moveAndClose(archiveFile)) {
- LOG.error("Failed to archive file:" + currentFile);
- return false;
- } else if (LOG.isDebugEnabled()) {
- LOG.debug("Finished archiving file from: " + currentFile + ", to: " + archiveFile);
- }
- return true;
- }
+ boolean success = false;
+ for (int i = 0; !success && i < DEFAULT_RETRIES_NUMBER; ++i) {
+ if (i > 0) {
+ // Ensure that the archive directory exists.
+ // The previous "move to archive" operation has failed probably because
+ // the cleaner has removed our archive directory (HBASE-7643).
+ // (we're in a retry loop, so don't worry too much about the exception)
+ try {
+ if (!fs.exists(archiveDir)) {
+ if (fs.mkdirs(archiveDir)) {
+ LOG.debug("Created archive directory:" + archiveDir);
+ }
+ }
+ } catch (IOException e) {
+ LOG.warn("Failed to create the archive directory: " + archiveDir, e);
+ }
+ }
- /**
- * Simple delete of regular files from the {@link FileSystem}.
- * <p>
- * This method is a more generic implementation that the other deleteXXX
- * methods in this class, allowing more code reuse at the cost of a couple
- * more, short-lived objects (which should have minimum impact on the jvm).
- * @param fs {@link FileSystem} where the files live
- * @param files {@link Collection} of files to be deleted
- * @throws IOException if a file cannot be deleted. All files will be
- * attempted to deleted before throwing the exception, rather than
- * failing at the first file.
- */
- private static void deleteFilesWithoutArchiving(Collection<File> files) throws IOException {
- List<IOException> errors = new ArrayList<IOException>(0);
- for (File file : files) {
try {
- LOG.debug("Deleting region file:" + file);
- file.delete();
+ success = currentFile.moveAndClose(archiveFile);
} catch (IOException e) {
- LOG.error("Failed to delete file:" + file);
- errors.add(e);
+ LOG.warn("Failed to archive file: " + currentFile + " on try #" + i, e);
+ success = false;
}
}
- if (errors.size() > 0) {
- throw MultipleIOException.createIOException(errors);
+
+ if (!success) {
+ LOG.error("Failed to archive file:" + currentFile);
+ return false;
}
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Finished archiving file from: " + currentFile + ", to: " + archiveFile);
+ }
+ return true;
}
/**
@@ -553,7 +556,7 @@ public class HFileArchiver {
public boolean moveAndClose(Path dest) throws IOException {
this.close();
Path p = this.getPath();
- return !fs.rename(p, dest);
+ return fs.rename(p, dest);
}
/**
Modified: hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/backup/TestHFileArchiving.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/backup/TestHFileArchiving.java?rev=1438972&r1=1438971&r2=1438972&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/backup/TestHFileArchiving.java (original)
+++ hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/backup/TestHFileArchiving.java Sat Jan 26 21:59:20 2013
@@ -36,7 +36,11 @@ import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.MediumTests;
+import org.apache.hadoop.hbase.Stoppable;
+import org.apache.hadoop.hbase.backup.HFileArchiver;
import org.apache.hadoop.hbase.client.HBaseAdmin;
+import org.apache.hadoop.hbase.master.MasterFileSystem;
+import org.apache.hadoop.hbase.master.cleaner.HFileCleaner;
import org.apache.hadoop.hbase.regionserver.ConstantSizeRegionSplitPolicy;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
@@ -44,6 +48,7 @@ import org.apache.hadoop.hbase.util.Byte
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.HFileArchiveTestingUtil;
import org.apache.hadoop.hbase.util.HFileArchiveUtil;
+import org.apache.hadoop.hbase.util.StoppableImplementation;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Assert;
@@ -319,6 +324,69 @@ public class TestHFileArchiving {
archivedFiles.containsAll(storeFiles));
}
+ /**
+ * Test HFileArchiver.resolveAndArchive() race condition HBASE-7643
+ */
+ @Test
+ public void testCleaningRace() throws Exception {
+ final long TEST_TIME = 20 * 1000;
+
+ Configuration conf = UTIL.getMiniHBaseCluster().getMaster().getConfiguration();
+ Path rootDir = UTIL.getDataTestDir("testCleaningRace");
+ FileSystem fs = UTIL.getTestFileSystem();
+
+ Path archiveDir = new Path(rootDir, HConstants.HFILE_ARCHIVE_DIRECTORY);
+ Path regionDir = new Path("table", "abcdef");
+ Path familyDir = new Path(regionDir, "cf");
+
+ Path sourceRegionDir = new Path(rootDir, regionDir);
+ fs.mkdirs(sourceRegionDir);
+
+ Stoppable stoppable = new StoppableImplementation();
+
+ // The cleaner should be looping without long pauses to reproduce the race condition.
+ HFileCleaner cleaner = new HFileCleaner(1, stoppable, conf, fs, archiveDir);
+ try {
+ cleaner.start();
+
+ // Keep creating/archiving new files while the cleaner is running in the other thread
+ long startTime = System.currentTimeMillis();
+ for (long fid = 0; (System.currentTimeMillis() - startTime) < TEST_TIME; ++fid) {
+ Path file = new Path(familyDir, String.valueOf(fid));
+ Path sourceFile = new Path(rootDir, file);
+ Path archiveFile = new Path(archiveDir, file);
+
+ fs.createNewFile(sourceFile);
+
+ try {
+ // Try to archive the file
+ HFileArchiver.archiveRegion(conf, fs, rootDir,
+ sourceRegionDir.getParent(), sourceRegionDir);
+
+ // The archiver succeeded, the file is no longer in the original location
+ // but it's in the archive location.
+ LOG.debug("hfile=" + fid + " should be in the archive");
+ assertTrue(fs.exists(archiveFile));
+ assertFalse(fs.exists(sourceFile));
+ } catch (IOException e) {
+ // The archiver is unable to archive the file. Probably HBASE-7643 race condition.
+ // in this case, the file should not be archived, and we should have the file
+ // in the original location.
+ LOG.debug("hfile=" + fid + " should be in the source location");
+ assertFalse(fs.exists(archiveFile));
+ assertTrue(fs.exists(sourceFile));
+
+ // Avoid having this file in the next run
+ fs.delete(sourceFile, false);
+ }
+ }
+ } finally {
+ stoppable.stop("test end");
+ cleaner.join();
+ fs.delete(rootDir, true);
+ }
+ }
+
private void clearArchiveDirectory() throws IOException {
UTIL.getTestFileSystem().delete(new Path(UTIL.getDefaultRootDirPath(), ".archive"), true);
}