You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by mb...@apache.org on 2013/01/26 22:59:20 UTC

svn commit: r1438972 - in /hbase/trunk/hbase-server/src: main/java/org/apache/hadoop/hbase/backup/HFileArchiver.java test/java/org/apache/hadoop/hbase/backup/TestHFileArchiving.java

Author: mbertozzi
Date: Sat Jan 26 21:59:20 2013
New Revision: 1438972

URL: http://svn.apache.org/viewvc?rev=1438972&view=rev
Log:
HBASE-7643 HFileArchiver.resolveAndArchive() race condition may lead to snapshot data loss

Modified:
    hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/backup/HFileArchiver.java
    hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/backup/TestHFileArchiving.java

Modified: hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/backup/HFileArchiver.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/backup/HFileArchiver.java?rev=1438972&r1=1438971&r2=1438972&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/backup/HFileArchiver.java (original)
+++ hbase/trunk/hbase-server/src/main/java/org/apache/hadoop/hbase/backup/HFileArchiver.java Sat Jan 26 21:59:20 2013
@@ -55,6 +55,9 @@ public class HFileArchiver {
   private static final Log LOG = LogFactory.getLog(HFileArchiver.class);
   private static final String SEPARATOR = ".";
 
+  /** Number of retries in case of fs operation failure */
+  private static final int DEFAULT_RETRIES_NUMBER = 3;
+
   private HFileArchiver() {
     // hidden ctor since this is just a util
   }
@@ -136,6 +139,7 @@ public class HFileArchiver {
     try {
       success = resolveAndArchive(fs, regionArchiveDir, toArchive);
     } catch (IOException e) {
+      LOG.error("Failed to archive: " + toArchive, e);
       success = false;
     }
 
@@ -146,7 +150,7 @@ public class HFileArchiver {
     }
 
     throw new IOException("Received error when attempting to archive files (" + toArchive
-        + "), cannot delete region directory.");
+        + "), cannot delete region directory. ");
   }
 
   /**
@@ -254,14 +258,12 @@ public class HFileArchiver {
     long start = EnvironmentEdgeManager.currentTimeMillis();
     List<File> failures = resolveAndArchive(fs, baseArchiveDir, toArchive, start);
 
-    // clean out the failures by just deleting them
+    // notify that some files were not archived.
+    // We can't delete the files, otherwise snapshots or other backup systems
+    // that rely on the archiver end up with data loss.
     if (failures.size() > 0) {
-      try {
-        LOG.error("Failed to complete archive, deleting extra store files.");
-        deleteFilesWithoutArchiving(failures);
-      } catch (IOException e) {
-        LOG.warn("Failed to delete store file(s) when archiving failed", e);
-      }
+      LOG.warn("Failed to complete archive of: " + failures +
+        ". Those files are still in the original location, and they may slow down reads.");
       return false;
     }
     return true;
@@ -364,50 +366,51 @@ public class HFileArchiver {
         if (!fs.delete(archiveFile, false)) {
           throw new IOException("Couldn't delete existing archive file (" + archiveFile
               + ") or rename it to the backup file (" + backedupArchiveFile
-              + ")to make room for similarly named file.");
+              + ") to make room for similarly named file.");
         }
       }
       LOG.debug("Backed up archive file from: " + archiveFile);
     }
 
-    LOG.debug("No existing file in archive for:" + archiveFile + ", free to archive original file.");
+    LOG.debug("No existing file in archive for:" + archiveFile +
+        ", free to archive original file.");
 
     // at this point, we should have a free spot for the archive file
-    if (currentFile.moveAndClose(archiveFile)) {
-      LOG.error("Failed to archive file:" + currentFile);
-      return false;
-    } else if (LOG.isDebugEnabled()) {
-      LOG.debug("Finished archiving file from: " + currentFile + ", to: " + archiveFile);
-    }
-    return true;
-  }
+    boolean success = false;
+    for (int i = 0; !success && i < DEFAULT_RETRIES_NUMBER; ++i) {
+      if (i > 0) {
+        // Ensure that the archive directory exists.
+        // The previous "move to archive" operation has failed probably because
+        // the cleaner has removed our archive directory (HBASE-7643).
+        // (we're in a retry loop, so don't worry too much about the exception)
+        try {
+          if (!fs.exists(archiveDir)) {
+            if (fs.mkdirs(archiveDir)) {
+              LOG.debug("Created archive directory:" + archiveDir);
+            }
+          }
+        } catch (IOException e) {
+          LOG.warn("Failed to create the archive directory: " + archiveDir, e);
+        }
+      }
 
-  /**
-   * Simple delete of regular files from the {@link FileSystem}.
-   * <p>
-   * This method is a more generic implementation that the other deleteXXX
-   * methods in this class, allowing more code reuse at the cost of a couple
-   * more, short-lived objects (which should have minimum impact on the jvm).
-   * @param fs {@link FileSystem} where the files live
-   * @param files {@link Collection} of files to be deleted
-   * @throws IOException if a file cannot be deleted. All files will be
-   *           attempted to deleted before throwing the exception, rather than
-   *           failing at the first file.
-   */
-  private static void deleteFilesWithoutArchiving(Collection<File> files) throws IOException {
-    List<IOException> errors = new ArrayList<IOException>(0);
-    for (File file : files) {
       try {
-        LOG.debug("Deleting region file:" + file);
-        file.delete();
+        success = currentFile.moveAndClose(archiveFile);
       } catch (IOException e) {
-        LOG.error("Failed to delete file:" + file);
-        errors.add(e);
+        LOG.warn("Failed to archive file: " + currentFile + " on try #" + i, e);
+        success = false;
       }
     }
-    if (errors.size() > 0) {
-      throw MultipleIOException.createIOException(errors);
+
+    if (!success) {
+      LOG.error("Failed to archive file:" + currentFile);
+      return false;
     }
+
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Finished archiving file from: " + currentFile + ", to: " + archiveFile);
+    }
+    return true;
   }
 
   /**
@@ -553,7 +556,7 @@ public class HFileArchiver {
     public boolean moveAndClose(Path dest) throws IOException {
       this.close();
       Path p = this.getPath();
-      return !fs.rename(p, dest);
+      return fs.rename(p, dest);
     }
 
     /**

Modified: hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/backup/TestHFileArchiving.java
URL: http://svn.apache.org/viewvc/hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/backup/TestHFileArchiving.java?rev=1438972&r1=1438971&r2=1438972&view=diff
==============================================================================
--- hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/backup/TestHFileArchiving.java (original)
+++ hbase/trunk/hbase-server/src/test/java/org/apache/hadoop/hbase/backup/TestHFileArchiving.java Sat Jan 26 21:59:20 2013
@@ -36,7 +36,11 @@ import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.hbase.HBaseTestingUtility;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.MediumTests;
+import org.apache.hadoop.hbase.Stoppable;
+import org.apache.hadoop.hbase.backup.HFileArchiver;
 import org.apache.hadoop.hbase.client.HBaseAdmin;
+import org.apache.hadoop.hbase.master.MasterFileSystem;
+import org.apache.hadoop.hbase.master.cleaner.HFileCleaner;
 import org.apache.hadoop.hbase.regionserver.ConstantSizeRegionSplitPolicy;
 import org.apache.hadoop.hbase.regionserver.HRegion;
 import org.apache.hadoop.hbase.regionserver.HRegionServer;
@@ -44,6 +48,7 @@ import org.apache.hadoop.hbase.util.Byte
 import org.apache.hadoop.hbase.util.FSUtils;
 import org.apache.hadoop.hbase.util.HFileArchiveTestingUtil;
 import org.apache.hadoop.hbase.util.HFileArchiveUtil;
+import org.apache.hadoop.hbase.util.StoppableImplementation;
 import org.junit.After;
 import org.junit.AfterClass;
 import org.junit.Assert;
@@ -319,6 +324,69 @@ public class TestHFileArchiving {
       archivedFiles.containsAll(storeFiles));
   }
 
+  /**
+   * Test HFileArchiver.resolveAndArchive() race condition HBASE-7643
+   */
+  @Test
+  public void testCleaningRace() throws Exception {
+    final long TEST_TIME = 20 * 1000;
+
+    Configuration conf = UTIL.getMiniHBaseCluster().getMaster().getConfiguration();
+    Path rootDir = UTIL.getDataTestDir("testCleaningRace");
+    FileSystem fs = UTIL.getTestFileSystem();
+
+    Path archiveDir = new Path(rootDir, HConstants.HFILE_ARCHIVE_DIRECTORY);
+    Path regionDir = new Path("table", "abcdef");
+    Path familyDir = new Path(regionDir, "cf");
+
+    Path sourceRegionDir = new Path(rootDir, regionDir);
+    fs.mkdirs(sourceRegionDir);
+
+    Stoppable stoppable = new StoppableImplementation();
+
+    // The cleaner should be looping without long pauses to reproduce the race condition.
+    HFileCleaner cleaner = new HFileCleaner(1, stoppable, conf, fs, archiveDir);
+    try {
+      cleaner.start();
+
+      // Keep creating/archiving new files while the cleaner is running in the other thread
+      long startTime = System.currentTimeMillis();
+      for (long fid = 0; (System.currentTimeMillis() - startTime) < TEST_TIME; ++fid) {
+        Path file = new Path(familyDir,  String.valueOf(fid));
+        Path sourceFile = new Path(rootDir, file);
+        Path archiveFile = new Path(archiveDir, file);
+
+        fs.createNewFile(sourceFile);
+
+        try {
+          // Try to archive the file
+          HFileArchiver.archiveRegion(conf, fs, rootDir,
+              sourceRegionDir.getParent(), sourceRegionDir);
+
+          // The archiver succeeded: the file is no longer in the original location
+          // but it's in the archive location.
+          LOG.debug("hfile=" + fid + " should be in the archive");
+          assertTrue(fs.exists(archiveFile));
+          assertFalse(fs.exists(sourceFile));
+        } catch (IOException e) {
+          // The archiver is unable to archive the file, probably due to the HBASE-7643 race condition.
+          // In this case, the file should not be archived, and we should still have the file
+          // in the original location.
+          LOG.debug("hfile=" + fid + " should be in the source location");
+          assertFalse(fs.exists(archiveFile));
+          assertTrue(fs.exists(sourceFile));
+
+          // Avoid having this file in the next run
+          fs.delete(sourceFile, false);
+        }
+      }
+    } finally {
+      stoppable.stop("test end");
+      cleaner.join();
+      fs.delete(rootDir, true);
+    }
+  }
+
   private void clearArchiveDirectory() throws IOException {
     UTIL.getTestFileSystem().delete(new Path(UTIL.getDefaultRootDirPath(), ".archive"), true);
   }