You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by vi...@apache.org on 2018/01/10 13:19:42 UTC

hadoop git commit: HADOOP-13738. DiskChecker should perform some disk IO.

Repository: hadoop
Updated Branches:
  refs/heads/branch-2.8 804696ee9 -> 7d5c00723


HADOOP-13738. DiskChecker should perform some disk IO.

(cherry picked from commit 148bb3e737291ccf9597de4b0c499d8016e59740)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/7d5c0072
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/7d5c0072
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/7d5c0072

Branch: refs/heads/branch-2.8
Commit: 7d5c00723cbb5266cbf40489da9b17b65aaaa1fb
Parents: 804696e
Author: Arpit Agarwal <ar...@apache.org>
Authored: Tue Nov 1 18:09:12 2016 -0700
Committer: Vinayakumar B <vi...@apache.org>
Committed: Wed Jan 10 18:45:28 2018 +0530

----------------------------------------------------------------------
 .../org/apache/hadoop/util/DiskChecker.java     | 159 ++++++++++++++++-
 .../org/apache/hadoop/util/TestDiskChecker.java | 178 ++++++++++++++++---
 2 files changed, 309 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/7d5c0072/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DiskChecker.java
----------------------------------------------------------------------
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DiskChecker.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DiskChecker.java
index 2c73af8..8563232 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DiskChecker.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DiskChecker.java
@@ -19,14 +19,23 @@
 package org.apache.hadoop.util;
 
 import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
 import java.io.IOException;
+import java.util.UUID;
+import java.util.concurrent.atomic.AtomicReference;
 
+import com.google.common.annotations.VisibleForTesting;
+import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.LocalFileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.io.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Class that provides utility functions for checking disk problem
@@ -34,6 +43,8 @@ import org.apache.hadoop.fs.permission.FsPermission;
 @InterfaceAudience.Private
 @InterfaceStability.Unstable
 public class DiskChecker {
+  public static final Logger LOG = LoggerFactory.getLogger(DiskChecker.class);
+
   public static class DiskErrorException extends IOException {
     public DiskErrorException(String msg) {
       super(msg);
@@ -49,7 +60,12 @@ public class DiskChecker {
       super(msg);
     }
   }
-      
+
+  // Provider that abstracts some FileOutputStream operations for
+  // testability.
+  private static AtomicReference<FileIoProvider> fileIoProvider =
+      new AtomicReference<FileIoProvider>(new DefaultFileIoProvider());
+
   /**
    * Create the directory if it doesn't exist and check that dir is readable,
    * writable and executable
@@ -63,6 +79,7 @@ public class DiskChecker {
                                    + dir.toString());
     }
     checkAccessByFileMethods(dir);
+    doDiskIo(dir);
   }
 
   /**
@@ -80,6 +97,7 @@ public class DiskChecker {
   throws DiskErrorException, IOException {
     mkdirsWithExistsAndPermissionCheck(localFS, dir, expected);
     checkAccessByFileMethods(localFS.pathToFile(dir));
+    doDiskIo(localFS.pathToFile(dir));
   }
 
   /**
@@ -173,4 +191,143 @@ public class DiskChecker {
     if (created || !localFS.getFileStatus(dir).getPermission().equals(expected))
       localFS.setPermission(dir, expected);
   }
+
+  // State related to running disk IO checks.
+  private static final String DISK_IO_FILE_PREFIX =
+      "DiskChecker.OK_TO_DELETE_.";
+
+  @VisibleForTesting
+  static final int DISK_IO_MAX_ITERATIONS = 3;
+
+  /**
+   * Performs some disk IO by writing to a new file in the given directory
+   * and sync'ing file contents to disk.
+   *
+   * This increases the likelihood of catching catastrophic disk/controller
+   * failures sooner.
+   *
+   * @param dir directory to be checked.
+   * @throws DiskErrorException if we hit an error while trying to perform
+   *         disk IO against the file.
+   */
+  private static void doDiskIo(File dir) throws DiskErrorException {
+    try {
+      IOException ioe = null;
+
+      for (int i = 0; i < DISK_IO_MAX_ITERATIONS; ++i) {
+        final File file = getFileNameForDiskIoCheck(dir, i+1);
+        try {
+          diskIoCheckWithoutNativeIo(file);
+          return;
+        } catch (IOException e) {
+          // Let's retry a few times before we really give up and
+          // declare the disk as bad.
+          ioe = e;
+        }
+      }
+      throw ioe;  // Just rethrow the last exception to signal failure.
+    } catch(IOException e) {
+      throw new DiskErrorException("Error checking directory " + dir, e);
+    }
+  }
+
+  /**
+   * Try to perform some disk IO by writing to the given file
+   * without using Native IO.
+   *
+   * @param file
+   * @throws IOException if there was a non-retriable error.
+   */
+  private static void diskIoCheckWithoutNativeIo(File file)
+      throws IOException {
+    FileOutputStream fos = null;
+
+    try {
+      final FileIoProvider provider = fileIoProvider.get();
+      fos = provider.get(file);
+      provider.write(fos, new byte[1]);
+      fos.getFD().sync();
+      fos.close();
+      fos = null;
+      if (!file.delete() && file.exists()) {
+        throw new IOException("Failed to delete " + file);
+      }
+      file = null;
+    } finally {
+      IOUtils.cleanup(null, fos);
+      FileUtils.deleteQuietly(file);
+    }
+  }
+
+  /**
+   * Generate a path name for a test file under the given directory.
+   *
+   * @return file object.
+   */
+  @VisibleForTesting
+  static File getFileNameForDiskIoCheck(File dir, int iterationCount) {
+    if (iterationCount < DISK_IO_MAX_ITERATIONS) {
+      // Use file names of the format prefix.001 by default.
+      return new File(dir,
+          DISK_IO_FILE_PREFIX + String.format("%03d", iterationCount));
+    } else {
+      // If the first few checks then fail, try using a randomly generated
+      // file name.
+      return new File(dir, DISK_IO_FILE_PREFIX + UUID.randomUUID());
+    }
+  }
+
+  /**
+   * An interface that abstracts operations on {@link FileOutputStream}
+   * objects for testability.
+   */
+  interface FileIoProvider {
+    FileOutputStream get(File f) throws FileNotFoundException;
+    void write(FileOutputStream fos, byte[] data) throws IOException;
+  }
+
+  /**
+   * The default implementation of {@link FileIoProvider}.
+   */
+  private static class DefaultFileIoProvider implements FileIoProvider {
+    /**
+     * See {@link FileOutputStream#FileOutputStream(File)}.
+     */
+    @Override
+    public FileOutputStream get(File f) throws FileNotFoundException {
+      return new FileOutputStream(f);
+    }
+
+    /**
+     * See {@link FileOutputStream#write(byte[])}.
+     */
+    @Override
+    public void write(FileOutputStream fos, byte[] data) throws IOException {
+      fos.write(data);
+    }
+  }
+
+  /**
+   * Replace the {@link FileIoProvider} for tests.
+   * This method MUST NOT be used outside of unit tests.
+   *
+   * @param newFosProvider
+   * @return the old FileIoProvider.
+   */
+  @VisibleForTesting
+  static FileIoProvider replaceFileOutputStreamProvider(
+      FileIoProvider newFosProvider) {
+    return fileIoProvider.getAndSet(newFosProvider);
+  }
+
+  /**
+   * Retrieve the current {@link FileIoProvider}.
+   * This method MUST NOT be used outside of unit tests.
+   *
+   * @return the current FileIoProvider.
+   */
+  @VisibleForTesting
+  static FileIoProvider getFileOutputStreamProvider() {
+    return fileIoProvider.get();
+  }
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/7d5c0072/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestDiskChecker.java
----------------------------------------------------------------------
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestDiskChecker.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestDiskChecker.java
index 5ab1313..43bd183 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestDiskChecker.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestDiskChecker.java
@@ -18,7 +18,11 @@
 package org.apache.hadoop.util;
 
 import java.io.*;
+import java.util.concurrent.atomic.AtomicInteger;
 
+import org.apache.hadoop.util.DiskChecker.FileIoProvider;
+import org.junit.After;
+import org.junit.Before;
 import org.junit.Test;
 import static org.junit.Assert.*;
 
@@ -34,27 +38,46 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.util.DiskChecker.DiskErrorException;
 import org.apache.hadoop.util.Shell;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 public class TestDiskChecker {
-  final FsPermission defaultPerm = new FsPermission("755");
-  final FsPermission invalidPerm = new FsPermission("000");
+  public static final Logger LOG =
+      LoggerFactory.getLogger(TestDiskChecker.class);
 
-  @Test (timeout = 30000)
+  private final FsPermission defaultPerm = new FsPermission("755");
+  private final FsPermission invalidPerm = new FsPermission("000");
+
+  private FileIoProvider fileIoProvider = null;
+
+  @Before
+  public void setup() {
+    // Some tests replace the static field DiskChecker#fileIoProvider.
+    // Cache it so we can restore it after each test completes.
+    fileIoProvider = DiskChecker.getFileOutputStreamProvider();
+  }
+
+  @After
+  public void cleanup() {
+    DiskChecker.replaceFileOutputStreamProvider(fileIoProvider);
+  }
+
+  @Test(timeout = 30000)
   public void testMkdirs_dirExists() throws Throwable {
     _mkdirs(true, defaultPerm, defaultPerm);
   }
 
-  @Test (timeout = 30000)
+  @Test(timeout = 30000)
   public void testMkdirs_noDir() throws Throwable {
     _mkdirs(false, defaultPerm, defaultPerm);
   }
 
-  @Test (timeout = 30000)
+  @Test(timeout = 30000)
   public void testMkdirs_dirExists_badUmask() throws Throwable {
     _mkdirs(true, defaultPerm, invalidPerm);
   }
 
-  @Test (timeout = 30000)
+  @Test(timeout = 30000)
   public void testMkdirs_noDir_badUmask() throws Throwable {
     _mkdirs(false, defaultPerm, invalidPerm);
   }
@@ -79,34 +102,33 @@ public class TestDiskChecker {
         verify(fs).getFileStatus(dir);
         verify(stat).getPermission();
       }
-    }
-    catch (DiskErrorException e) {
+    } catch (DiskErrorException e) {
       if (before != after)
         assertTrue(e.getMessage().startsWith("Incorrect permission"));
     }
   }
 
-  @Test (timeout = 30000)
+  @Test(timeout = 30000)
   public void testCheckDir_normal() throws Throwable {
     _checkDirs(true, new FsPermission("755"), true);
   }
 
-  @Test (timeout = 30000)
+  @Test(timeout = 30000)
   public void testCheckDir_notDir() throws Throwable {
     _checkDirs(false, new FsPermission("000"), false);
   }
 
-  @Test (timeout = 30000)
+  @Test(timeout = 30000)
   public void testCheckDir_notReadable() throws Throwable {
     _checkDirs(true, new FsPermission("000"), false);
   }
 
-  @Test (timeout = 30000)
+  @Test(timeout = 30000)
   public void testCheckDir_notWritable() throws Throwable {
     _checkDirs(true, new FsPermission("444"), false);
   }
 
-  @Test (timeout = 30000)
+  @Test(timeout = 30000)
   public void testCheckDir_notListable() throws Throwable {
     _checkDirs(true, new FsPermission("666"), false);   // not listable
   }
@@ -119,13 +141,15 @@ public class TestDiskChecker {
       localDir.mkdir();
     }
     Shell.execCommand(Shell.getSetPermissionCommand(String.format("%04o",
-      perm.toShort()), false, localDir.getAbsolutePath()));
+        perm.toShort()), false, localDir.getAbsolutePath()));
     try {
       DiskChecker.checkDir(FileSystem.getLocal(new Configuration()),
-        new Path(localDir.getAbsolutePath()), perm);
-      assertTrue("checkDir success", success);
+          new Path(localDir.getAbsolutePath()), perm);
+      assertTrue("checkDir success, expected failure", success);
     } catch (DiskErrorException e) {
-      assertFalse("checkDir success", success);
+      if (success) {
+        throw e; // Unexpected exception!
+      }
     }
     localDir.delete();
   }
@@ -135,27 +159,27 @@ public class TestDiskChecker {
    * permission for result of mapper.
    */
 
-  @Test (timeout = 30000)
+  @Test(timeout = 30000)
   public void testCheckDir_normal_local() throws Throwable {
     _checkDirs(true, "755", true);
   }
 
-  @Test (timeout = 30000)
+  @Test(timeout = 30000)
   public void testCheckDir_notDir_local() throws Throwable {
     _checkDirs(false, "000", false);
   }
 
-  @Test (timeout = 30000)
+  @Test(timeout = 30000)
   public void testCheckDir_notReadable_local() throws Throwable {
     _checkDirs(true, "000", false);
   }
 
-  @Test (timeout = 30000)
+  @Test(timeout = 30000)
   public void testCheckDir_notWritable_local() throws Throwable {
     _checkDirs(true, "444", false);
   }
 
-  @Test (timeout = 30000)
+  @Test(timeout = 30000)
   public void testCheckDir_notListable_local() throws Throwable {
     _checkDirs(true, "666", false);
   }
@@ -168,16 +192,116 @@ public class TestDiskChecker {
       localDir.mkdir();
     }
     Shell.execCommand(Shell.getSetPermissionCommand(perm, false,
-                                                    localDir.getAbsolutePath()));
+        localDir.getAbsolutePath()));
     try {
       DiskChecker.checkDir(localDir);
-      assertTrue("checkDir success", success);
+      assertTrue("checkDir success, expected failure", success);
     } catch (DiskErrorException e) {
-      e.printStackTrace();
-      assertFalse("checkDir success", success);
+      if (success) {
+        throw e; // Unexpected exception!
+      }
     }
     localDir.delete();
-    System.out.println("checkDir success: " + success);
+  }
 
+  /**
+   * Verify DiskChecker ignores at least 2 transient file creation errors.
+   */
+  @Test(timeout = 30000)
+  public void testDiskIoIgnoresTransientCreateErrors() throws Throwable {
+    DiskChecker.replaceFileOutputStreamProvider(new TestFileIoProvider(
+        DiskChecker.DISK_IO_MAX_ITERATIONS - 1, 0));
+    _checkDirs(true, "755", true);
+  }
+
+  /**
+   * Verify DiskChecker bails after 3 file creation errors.
+   */
+  @Test(timeout = 30000)
+  public void testDiskIoDetectsCreateErrors() throws Throwable {
+    DiskChecker.replaceFileOutputStreamProvider(new TestFileIoProvider(
+        DiskChecker.DISK_IO_MAX_ITERATIONS, 0));
+    _checkDirs(true, "755", false);
+  }
+
+  /**
+   * Verify DiskChecker ignores at least 2 transient file write errors.
+   */
+  @Test(timeout = 30000)
+  public void testDiskIoIgnoresTransientWriteErrors() throws Throwable {
+    DiskChecker.replaceFileOutputStreamProvider(new TestFileIoProvider(
+        0, DiskChecker.DISK_IO_MAX_ITERATIONS - 1));
+    _checkDirs(true, "755", true);
+  }
+
+  /**
+   * Verify DiskChecker bails after 3 file write errors.
+   */
+  @Test(timeout = 30000)
+  public void testDiskIoDetectsWriteErrors() throws Throwable {
+    DiskChecker.replaceFileOutputStreamProvider(new TestFileIoProvider(
+        0, DiskChecker.DISK_IO_MAX_ITERATIONS));
+    _checkDirs(true, "755", false);
+  }
+
+  /**
+   * Verify DiskChecker's test file naming scheme.
+   */
+  @Test(timeout = 30000)
+  public void testDiskIoFileNaming() throws Throwable {
+    final File rootDir = new File("/");
+    assertTrue(".001".matches("\\.00\\d$"));
+    for (int i = 1; i < DiskChecker.DISK_IO_MAX_ITERATIONS; ++i) {
+      final File file = DiskChecker.getFileNameForDiskIoCheck(rootDir, i);
+      assertTrue(
+          "File name does not match expected pattern: " + file,
+          file.toString().matches("^.*\\.[0-9]+$"));
+    }
+    final File guidFile = DiskChecker.getFileNameForDiskIoCheck(
+        rootDir, DiskChecker.DISK_IO_MAX_ITERATIONS);
+    assertTrue(
+        "File name does not match expected pattern: " + guidFile,
+        guidFile.toString().matches("^.*\\.[A-Za-z0-9-]+$"));
+  }
+
+  /**
+   * A dummy {@link DiskChecker.FileIoProvider} that can throw a programmable
+   * number of times.
+   */
+  private static class TestFileIoProvider implements FileIoProvider {
+    private final AtomicInteger numCreateCalls = new AtomicInteger(0);
+    private final AtomicInteger numWriteCalls = new AtomicInteger(0);
+
+    private final int numTimesToThrowOnCreate;
+    private final int numTimesToThrowOnWrite;
+
+    public TestFileIoProvider(
+        int numTimesToThrowOnCreate, int numTimesToThrowOnWrite) {
+      this.numTimesToThrowOnCreate = numTimesToThrowOnCreate;
+      this.numTimesToThrowOnWrite = numTimesToThrowOnWrite;
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public FileOutputStream get(File f) throws FileNotFoundException {
+      if (numCreateCalls.getAndIncrement() < numTimesToThrowOnCreate) {
+        throw new FileNotFoundException("Dummy exception for testing");
+      }
+      // Can't mock final class FileOutputStream.
+      return new FileOutputStream(f);
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public void write(FileOutputStream fos, byte[] data) throws IOException {
+      if (numWriteCalls.getAndIncrement() < numTimesToThrowOnWrite) {
+        throw new IOException("Dummy exception for testing");
+      }
+      fos.write(data);
+    }
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org