Posted to common-commits@hadoop.apache.org by ka...@apache.org on 2015/03/25 15:54:55 UTC

[25/51] [abbrv] hadoop git commit: HDFS-7917. Use file to replace data dirs in test to simulate a disk failure. Contributed by Lei (Eddy) Xu.

HDFS-7917. Use file to replace data dirs in test to simulate a disk failure. Contributed by Lei (Eddy) Xu.


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/2c238ae4
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/2c238ae4
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/2c238ae4

Branch: refs/heads/YARN-2139
Commit: 2c238ae4e00371ef76582b007bb0e20ac8455d9c
Parents: 972f1f1
Author: cnauroth <cn...@apache.org>
Authored: Mon Mar 23 16:29:51 2015 -0700
Committer: cnauroth <cn...@apache.org>
Committed: Mon Mar 23 16:29:51 2015 -0700

----------------------------------------------------------------------
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt     |  3 +
 .../hdfs/server/datanode/DataNodeTestUtils.java | 61 +++++++++++++++++++-
 .../datanode/TestDataNodeHotSwapVolumes.java    | 29 ++++------
 .../datanode/TestDataNodeVolumeFailure.java     | 11 +---
 .../TestDataNodeVolumeFailureReporting.java     | 46 ++++-----------
 .../TestDataNodeVolumeFailureToleration.java    |  8 +--
 6 files changed, 88 insertions(+), 70 deletions(-)
----------------------------------------------------------------------
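As a quick orientation before the per-file diffs: the patch adds two test helpers, DataNodeTestUtils.injectDataDirFailure and DataNodeTestUtils.restoreDataDirFromFailure, and the test changes below replace the old FileUtil.setExecutable(dir, false) failure simulation with calls to them. The JUnit-style sketch below shows the intended call pattern; it is illustrative only (the test class name is hypothetical, it assumes the hadoop-hdfs test classpath, and it locates a volume via MiniDFSCluster.getInstanceStorageDir, whereas the tests in this patch build the path from the cluster's data directory).

import java.io.File;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
import org.junit.Test;

public class TestDataDirFailureInjectionExample {
  @Test
  public void testInjectAndRestore() throws Exception {
    Configuration conf = new HdfsConfiguration();
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
        .numDataNodes(1).build();
    try {
      cluster.waitActive();
      // First storage dir of the first datanode (the tests in this patch
      // build the equivalent path from the cluster's data directory).
      File vol = cluster.getInstanceStorageDir(0, 0);

      // Rename the dir aside and drop a regular file in its place, so the
      // volume behaves like a failed disk.
      DataNodeTestUtils.injectDataDirFailure(vol);

      // ... trigger and assert on the datanode's volume failure handling ...

      // Put the original directory back so the path is reusable afterwards.
      DataNodeTestUtils.restoreDataDirFromFailure(vol);
    } finally {
      cluster.shutdown();
    }
  }
}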


http://git-wip-us.apache.org/repos/asf/hadoop/blob/2c238ae4/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index 8c99876..b88b7e3 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -774,6 +774,9 @@ Release 2.7.0 - UNRELEASED
 
     HDFS-7962. Remove duplicated logs in BlockManager. (yliu)
 
+    HDFS-7917. Use file to replace data dirs in test to simulate a disk failure.
+    (Lei (Eddy) Xu via cnauroth)
+
   OPTIMIZATIONS
 
     HDFS-7454. Reduce memory footprint for AclEntries in NameNode.

http://git-wip-us.apache.org/repos/asf/hadoop/blob/2c238ae4/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeTestUtils.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeTestUtils.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeTestUtils.java
index fd51e52..f9a2ba1 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeTestUtils.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeTestUtils.java
@@ -40,7 +40,9 @@ import com.google.common.base.Preconditions;
  * Utility class for accessing package-private DataNode information during tests.
  *
  */
-public class DataNodeTestUtils {  
+public class DataNodeTestUtils {
+  private static final String DIR_FAILURE_SUFFIX = ".origin";
+
   public static DatanodeRegistration 
   getDNRegistrationForBP(DataNode dn, String bpid) throws IOException {
     return dn.getDNRegistrationForBP(bpid);
@@ -159,4 +161,61 @@ public class DataNodeTestUtils {
       final String bpid, final long blkId) {
     return FsDatasetTestUtil.fetchReplicaInfo(dn.getFSDataset(), bpid, blkId);
   }
+
+  /**
+   * It injects disk failures to data dirs by replacing these data dirs with
+   * regular files.
+   *
+   * @param dirs data directories.
+   * @throws IOException on I/O error.
+   */
+  public static void injectDataDirFailure(File... dirs) throws IOException {
+    for (File dir : dirs) {
+      File renamedTo = new File(dir.getPath() + DIR_FAILURE_SUFFIX);
+      if (renamedTo.exists()) {
+        throw new IOException(String.format(
+            "Can not inject failure to dir: %s because %s exists.",
+            dir, renamedTo));
+      }
+      if (!dir.renameTo(renamedTo)) {
+        throw new IOException(String.format("Failed to rename %s to %s.",
+            dir, renamedTo));
+      }
+      if (!dir.createNewFile()) {
+        throw new IOException(String.format(
+            "Failed to create file %s to inject disk failure.", dir));
+      }
+    }
+  }
+
+  /**
+   * Restore the injected data dir failures.
+   *
+   * @see #injectDataDirFailure(File...)
+   * @param dirs data directories.
+   * @throws IOException
+   */
+  public static void restoreDataDirFromFailure(File... dirs)
+      throws IOException {
+    for (File dir : dirs) {
+      File renamedDir = new File(dir.getPath() + DIR_FAILURE_SUFFIX);
+      if (renamedDir.exists()) {
+        if (dir.exists()) {
+          if (!dir.isFile()) {
+            throw new IOException(
+                "Injected failure data dir is supposed to be file: " + dir);
+          }
+          if (!dir.delete()) {
+            throw new IOException(
+                "Failed to delete injected failure data dir: " + dir);
+          }
+        }
+        if (!renamedDir.renameTo(dir)) {
+          throw new IOException(String.format(
+              "Failed to recover injected failure data dir %s to %s.",
+              renamedDir, dir));
+        }
+      }
+    }
+  }
 }
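The reason a regular file at the data dir's path behaves like a failed disk is that every directory-style operation on that path now fails, which is what the DataNode's disk checking treats as a volume failure; unlike the FileUtil.setExecutable approach being removed here, it does not rely on POSIX permission bits (and so also works where the old trick did not). A minimal standalone illustration of the same rename-and-replace trick, using plain java.io and a hypothetical class name, independent of the helpers above:

import java.io.File;
import java.io.IOException;

public class FailedDirDemo {
  public static void main(String[] args) throws IOException {
    File dir = new File("volume1");
    if (!dir.mkdir()) {
      throw new IOException("could not create " + dir);
    }

    // Simulate the failure the same way injectDataDirFailure does:
    // move the directory aside and occupy its path with a regular file.
    File saved = new File(dir.getPath() + ".origin");
    if (!dir.renameTo(saved) || !dir.createNewFile()) {
      throw new IOException("failed to inject failure for " + dir);
    }

    // Directory-style operations on the path now fail, which is what the
    // DataNode's disk checker treats as a volume failure.
    File block = new File(dir, "blk_0001");
    System.out.println("mkdirs under failed dir: " + block.mkdirs());  // false
    System.out.println("list failed dir: " + dir.listFiles());         // null

    // Restore, mirroring restoreDataDirFromFailure.
    if (!dir.delete() || !saved.renameTo(dir)) {
      throw new IOException("failed to restore " + dir);
    }
  }
}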

http://git-wip-us.apache.org/repos/asf/hadoop/blob/2c238ae4/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeHotSwapVolumes.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeHotSwapVolumes.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeHotSwapVolumes.java
index 8ab3dd2..2f51d45 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeHotSwapVolumes.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeHotSwapVolumes.java
@@ -26,7 +26,6 @@ import org.apache.hadoop.fs.BlockLocation;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.BlockMissingException;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
@@ -682,26 +681,18 @@ public class TestDataNodeHotSwapVolumes {
         failedVolume != null);
     long used = failedVolume.getDfsUsed();
 
-    try {
-      assertTrue("Couldn't chmod local vol: " + dirToFail,
-          FileUtil.setExecutable(dirToFail, false));
-      // Call and wait DataNode to detect disk failure.
-      long lastDiskErrorCheck = dn.getLastDiskErrorCheck();
-      dn.checkDiskErrorAsync();
-      while (dn.getLastDiskErrorCheck() == lastDiskErrorCheck) {
-        Thread.sleep(100);
-      }
-
-      createFile(new Path("/test1"), 32, (short)2);
-      assertEquals(used, failedVolume.getDfsUsed());
-    } finally {
-      // Need to restore the mode on dirToFail. Otherwise, if an Exception
-      // is thrown above, the following tests can not delete this data directory
-      // and thus fail to start MiniDFSCluster.
-      assertTrue("Couldn't restore executable for: " + dirToFail,
-          FileUtil.setExecutable(dirToFail, true));
+    DataNodeTestUtils.injectDataDirFailure(dirToFail);
+    // Call and wait DataNode to detect disk failure.
+    long lastDiskErrorCheck = dn.getLastDiskErrorCheck();
+    dn.checkDiskErrorAsync();
+    while (dn.getLastDiskErrorCheck() == lastDiskErrorCheck) {
+      Thread.sleep(100);
     }
 
+    createFile(new Path("/test1"), 32, (short)2);
+    assertEquals(used, failedVolume.getDfsUsed());
+
+    DataNodeTestUtils.restoreDataDirFromFailure(dirToFail);
     dn.reconfigurePropertyImpl(DFS_DATANODE_DATA_DIR_KEY, oldDataDir);
 
     createFile(new Path("/test2"), 32, (short)2);
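The sequence above (inject the failure, call checkDiskErrorAsync, then spin until getLastDiskErrorCheck() advances) is the same pattern the other tests touched by this patch follow. Purely as a sketch, and not part of the patch, a bounded variant of that wait might look like the class below; it is placed in the datanode package because the tests access those DataNode methods from there.

package org.apache.hadoop.hdfs.server.datanode;

import java.util.concurrent.TimeoutException;

// Hypothetical helper, not part of this patch: a bounded version of the
// "wait until the DataNode re-checks its disks" loop used in these tests.
public class DiskCheckWaiter {
  public static void waitForDiskCheck(DataNode dn, long timeoutMs)
      throws InterruptedException, TimeoutException {
    long lastCheck = dn.getLastDiskErrorCheck();
    dn.checkDiskErrorAsync();  // ask the DataNode to scan its volumes
    long deadline = System.currentTimeMillis() + timeoutMs;
    while (dn.getLastDiskErrorCheck() == lastCheck) {
      if (System.currentTimeMillis() > deadline) {
        throw new TimeoutException(
            "DataNode did not re-check disks within " + timeoutMs + " ms");
      }
      Thread.sleep(100);
    }
  }
}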

http://git-wip-us.apache.org/repos/asf/hadoop/blob/2c238ae4/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailure.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailure.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailure.java
index 9cbad6d..0428b81 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailure.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailure.java
@@ -121,10 +121,6 @@ public class TestDataNodeVolumeFailure {
     if(cluster != null) {
       cluster.shutdown();
     }
-    for (int i = 0; i < 3; i++) {
-      FileUtil.setExecutable(new File(dataDir, "data"+(2*i+1)), true);
-      FileUtil.setExecutable(new File(dataDir, "data"+(2*i+2)), true);
-    }
   }
   
   /*
@@ -159,7 +155,7 @@ public class TestDataNodeVolumeFailure {
         !deteteBlocks(failedDir)
         ) {
       throw new IOException("Could not delete hdfs directory '" + failedDir + "'");
-    }    
+    }
     data_fail.setReadOnly();
     failedDir.setReadOnly();
     System.out.println("Deleteing " + failedDir.getPath() + "; exist=" + failedDir.exists());
@@ -217,7 +213,7 @@ public class TestDataNodeVolumeFailure {
     DFSTestUtil.waitReplication(fs, file1, (short) 2);
 
     File dn0Vol1 = new File(dataDir, "data" + (2 * 0 + 1));
-    assertTrue(FileUtil.setExecutable(dn0Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn0Vol1);
     DataNode dn0 = cluster.getDataNodes().get(0);
     long lastDiskErrorCheck = dn0.getLastDiskErrorCheck();
     dn0.checkDiskErrorAsync();
@@ -291,8 +287,7 @@ public class TestDataNodeVolumeFailure {
     // Fail the first volume on both datanodes
     File dn1Vol1 = new File(dataDir, "data"+(2*0+1));
     File dn2Vol1 = new File(dataDir, "data"+(2*1+1));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn2Vol1);
 
     Path file2 = new Path("/test2");
     DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L);

http://git-wip-us.apache.org/repos/asf/hadoop/blob/2c238ae4/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureReporting.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureReporting.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureReporting.java
index 9842f25..aac288a 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureReporting.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureReporting.java
@@ -34,7 +34,6 @@ import org.apache.commons.logging.impl.Log4JLogger;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.ReconfigurationException;
 import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.DFSTestUtil;
@@ -87,19 +86,6 @@ public class TestDataNodeVolumeFailureReporting {
 
   @After
   public void tearDown() throws Exception {
-    // Restore executable permission on all directories where a failure may have
-    // been simulated by denying execute access.  This is based on the maximum
-    // number of datanodes and the maximum number of storages per data node used
-    // throughout the tests in this suite.
-    assumeTrue(!Path.WINDOWS);
-    int maxDataNodes = 3;
-    int maxStoragesPerDataNode = 4;
-    for (int i = 0; i < maxDataNodes; i++) {
-      for (int j = 1; j <= maxStoragesPerDataNode; j++) {
-        String subDir = "data" + ((i * maxStoragesPerDataNode) + j);
-        FileUtil.setExecutable(new File(dataDir, subDir), true);
-      }
-    }
     IOUtils.cleanup(LOG, fs);
     if (cluster != null) {
       cluster.shutdown();
@@ -141,8 +127,7 @@ public class TestDataNodeVolumeFailureReporting {
      * fail. The client does not retry failed nodes even though
      * perhaps they could succeed because just a single volume failed.
      */
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn2Vol1);
 
     /*
      * Create file1 and wait for 3 replicas (ie all DNs can still
@@ -179,7 +164,7 @@ public class TestDataNodeVolumeFailureReporting {
      * Now fail a volume on the third datanode. We should be able to get
      * three replicas since we've already identified the other failures.
      */
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn3Vol1);
     Path file2 = new Path("/test2");
     DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L);
     DFSTestUtil.waitReplication(fs, file2, (short)3);
@@ -208,7 +193,7 @@ public class TestDataNodeVolumeFailureReporting {
      * and that it's no longer up. Only wait for two replicas since
      * we'll never get a third.
      */
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol2, false));
+    DataNodeTestUtils.injectDataDirFailure(dn3Vol2);
     Path file3 = new Path("/test3");
     DFSTestUtil.createFile(fs, file3, 1024, (short)3, 1L);
     DFSTestUtil.waitReplication(fs, file3, (short)2);
@@ -233,10 +218,8 @@ public class TestDataNodeVolumeFailureReporting {
      * restart, so file creation should be able to succeed after
      * restoring the data directories and restarting the datanodes.
      */
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, true));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, true));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol1, true));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol2, true));
+    DataNodeTestUtils.restoreDataDirFromFailure(
+        dn1Vol1, dn2Vol1, dn3Vol1, dn3Vol2);
     cluster.restartDataNodes();
     cluster.waitActive();
     Path file4 = new Path("/test4");
@@ -275,8 +258,7 @@ public class TestDataNodeVolumeFailureReporting {
     // third healthy so one node in the pipeline will not fail). 
     File dn1Vol1 = new File(dataDir, "data"+(2*0+1));
     File dn2Vol1 = new File(dataDir, "data"+(2*1+1));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn2Vol1);
 
     Path file1 = new Path("/test1");
     DFSTestUtil.createFile(fs, file1, 1024, (short)2, 1L);
@@ -323,14 +305,7 @@ public class TestDataNodeVolumeFailureReporting {
 
     // Make the first two volume directories on the first two datanodes
     // non-accessible.
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1,
-        false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol2,
-        false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1,
-        false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol2,
-        false));
+    DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn1Vol2, dn2Vol1, dn2Vol2);
 
     // Create file1 and wait for 3 replicas (ie all DNs can still store a block).
     // Then assert that all DNs are up, despite the volume failures.
@@ -380,8 +355,8 @@ public class TestDataNodeVolumeFailureReporting {
     File dn1Vol2 = new File(dataDir, "data"+(2*0+2));
     File dn2Vol1 = new File(dataDir, "data"+(2*1+1));
     File dn2Vol2 = new File(dataDir, "data"+(2*1+2));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn1Vol1);
+    DataNodeTestUtils.injectDataDirFailure(dn2Vol1);
 
     Path file1 = new Path("/test1");
     DFSTestUtil.createFile(fs, file1, 1024, (short)2, 1L);
@@ -449,8 +424,7 @@ public class TestDataNodeVolumeFailureReporting {
 
     // Replace failed volume with healthy volume and run reconfigure DataNode.
     // The failed volume information should be cleared.
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, true));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, true));
+    DataNodeTestUtils.restoreDataDirFromFailure(dn1Vol1, dn2Vol1);
     reconfigureDataNode(dns.get(0), dn1Vol1, dn1Vol2);
     reconfigureDataNode(dns.get(1), dn2Vol1, dn2Vol2);
 

http://git-wip-us.apache.org/repos/asf/hadoop/blob/2c238ae4/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureToleration.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureToleration.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureToleration.java
index 73dc77c..5b7ac30 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureToleration.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureToleration.java
@@ -76,10 +76,6 @@ public class TestDataNodeVolumeFailureToleration {
 
   @After
   public void tearDown() throws Exception {
-    for (int i = 0; i < 3; i++) {
-      FileUtil.setExecutable(new File(dataDir, "data"+(2*i+1)), true);
-      FileUtil.setExecutable(new File(dataDir, "data"+(2*i+2)), true);
-    }
     cluster.shutdown();
   }
 
@@ -152,7 +148,7 @@ public class TestDataNodeVolumeFailureToleration {
 
     // Fail a volume on the 2nd DN
     File dn2Vol1 = new File(dataDir, "data"+(2*1+1));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn2Vol1);
 
     // Should only get two replicas (the first DN and the 3rd)
     Path file1 = new Path("/test1");
@@ -165,7 +161,7 @@ public class TestDataNodeVolumeFailureToleration {
 
     // If we restore the volume we should still only be able to get
     // two replicas since the DN is still considered dead.
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, true));
+    DataNodeTestUtils.restoreDataDirFromFailure(dn2Vol1);
     Path file2 = new Path("/test2");
     DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L);
     DFSTestUtil.waitReplication(fs, file2, (short)2);