You are viewing a plain text version of this content. The canonical link for it is here.
Posted to hdfs-commits@hadoop.apache.org by cm...@apache.org on 2013/11/08 04:00:20 UTC

svn commit: r1539909 - in /hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs: ./ src/main/java/org/apache/hadoop/hdfs/client/ src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/...

Author: cmccabe
Date: Fri Nov  8 03:00:19 2013
New Revision: 1539909

URL: http://svn.apache.org/r1539909
Log:
HDFS-5394: Fix race conditions in DN caching and uncaching (cmccabe)

Modified:
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/client/ClientMmap.java
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetCache.java
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeImpl.java
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/MappableBlock.java
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestFsDatasetCache.java

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt?rev=1539909&r1=1539908&r2=1539909&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt Fri Nov  8 03:00:19 2013
@@ -359,6 +359,8 @@ Trunk (Unreleased)
     HDFS-5468. CacheAdmin help command does not recognize commands  (Stephen
     Chu via Colin Patrick McCabe)
 
+    HDFS-5394. Fix race conditions in DN caching and uncaching (cmccabe)
+
 Release 2.3.0 - UNRELEASED
 
   INCOMPATIBLE CHANGES

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/client/ClientMmap.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/client/ClientMmap.java?rev=1539909&r1=1539908&r2=1539909&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/client/ClientMmap.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/client/ClientMmap.java Fri Nov  8 03:00:19 2013
@@ -22,6 +22,7 @@ import java.io.FileInputStream;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.hdfs.protocol.DatanodeID;
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
+import org.apache.hadoop.io.nativeio.NativeIO;
 
 import java.io.IOException;
 import java.lang.ref.WeakReference;
@@ -147,20 +148,9 @@ public class ClientMmap {
 
   /**
    * Unmap the memory region.
-   *
-   * There isn't any portable way to unmap a memory region in Java.
-   * So we use the sun.nio method here.
-   * Note that unmapping a memory region could cause crashes if code
-   * continues to reference the unmapped code.  However, if we don't
-   * manually unmap the memory, we are dependent on the finalizer to
-   * do it, and we have no idea when the finalizer will run.
    */
   void unmap() {
     assert(refCount.get() == 0);
-    if (map instanceof sun.nio.ch.DirectBuffer) {
-      final sun.misc.Cleaner cleaner =
-          ((sun.nio.ch.DirectBuffer) map).cleaner();
-      cleaner.clean();
-    }
+    NativeIO.POSIX.munmap(map);
   }
 }

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java?rev=1539909&r1=1539908&r2=1539909&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java Fri Nov  8 03:00:19 2013
@@ -47,7 +47,6 @@ import org.apache.hadoop.ipc.Server;
 import org.apache.hadoop.net.*;
 import org.apache.hadoop.net.NetworkTopology.InvalidTopologyException;
 import org.apache.hadoop.util.Daemon;
-import org.apache.hadoop.util.IntrusiveCollection;
 import org.apache.hadoop.util.ReflectionUtils;
 import org.apache.hadoop.util.Time;
 

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetCache.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetCache.java?rev=1539909&r1=1539908&r2=1539909&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetCache.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetCache.java Fri Nov  8 03:00:19 2013
@@ -18,24 +18,35 @@
 
 package org.apache.hadoop.hdfs.server.datanode.fsdataset.impl;
 
+import com.google.common.base.Preconditions;
+import com.google.common.util.concurrent.ThreadFactoryBuilder;
+
 import java.io.FileInputStream;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ConcurrentMap;
+import java.util.Map.Entry;
+import java.util.concurrent.Executor;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.ThreadFactory;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicLong;
 
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.builder.HashCodeBuilder;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.fs.ChecksumException;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
-import org.apache.hadoop.hdfs.protocol.Block;
 import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
+import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
+import org.apache.hadoop.io.nativeio.NativeIO;
 
 /**
  * Manages caching for an FsDatasetImpl by using the mmap(2) and mlock(2)
@@ -45,178 +56,411 @@ import org.apache.hadoop.hdfs.protocol.B
 @InterfaceAudience.Private
 @InterfaceStability.Unstable
 public class FsDatasetCache {
+  /**
+   * Keys which identify MappableBlocks.
+   */
+  private static final class Key {
+    /**
+     * Block id.
+     */
+    final long id;
+
+    /**
+     * Block pool id.
+     */
+    final String bpid;
+
+    Key(long id, String bpid) {
+      this.id = id;
+      this.bpid = bpid;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+      if (o == null) {
+        return false;
+      }
+      if (!(o.getClass() == getClass())) {
+        return false;
+      }
+      Key other = (Key)o;
+      return ((other.id == this.id) && (other.bpid.equals(this.bpid)));
+    }
+
+    @Override
+    public int hashCode() {
+      return new HashCodeBuilder().append(id).append(bpid).hashCode();
+    }
+  };
+
+  /**
+   * MappableBlocks that we know about.
+   */
+  private static final class Value {
+    final State state;
+    final MappableBlock mappableBlock;
+
+    Value(MappableBlock mappableBlock, State state) {
+      this.mappableBlock = mappableBlock;
+      this.state = state;
+    }
+  }
+
+  private enum State {
+    /**
+     * The MappableBlock is in the process of being cached.
+     */
+    CACHING,
+
+    /**
+     * The MappableBlock was in the process of being cached, but it was
+     * cancelled.  Only the FsDatasetCache#CachingTask can remove cancelled
+     * MappableBlock objects.
+     */
+    CACHING_CANCELLED,
+
+    /**
+     * The MappableBlock is in the cache.
+     */
+    CACHED,
+
+    /**
+     * The MappableBlock is in the process of uncaching.
+     */
+    UNCACHING;
+
+    /**
+     * Whether we should advertise this block as cached to the NameNode and
+     * clients.
+     */
+    public boolean shouldAdvertise() {
+      return (this == CACHED);
+    }
+  }
 
   private static final Log LOG = LogFactory.getLog(FsDatasetCache.class);
 
   /**
-   * Map of cached blocks
+   * Stores MappableBlock objects and the states they're in.
    */
-  private final ConcurrentMap<Long, MappableBlock> cachedBlocks;
+  private final HashMap<Key, Value> mappableBlockMap = new HashMap<Key, Value>();
 
   private final FsDatasetImpl dataset;
+
+  private final ThreadPoolExecutor uncachingExecutor;
+
   /**
-   * Number of cached bytes
+   * The approximate amount of cache space in use.
+   *
+   * This number is an overestimate, counting bytes that will be used only
+   * if pending caching operations succeed.  It does not take into account
+   * pending uncaching operations.
+   *
+   * This overestimate is more useful to the NameNode than an underestimate,
+   * since we don't want the NameNode to assign us more replicas than
+   * we can cache, because of the current batch of operations.
    */
-  private AtomicLong usedBytes;
+  private final UsedBytesCount usedBytesCount;
+
+  public static class PageRounder {
+    private final long osPageSize = NativeIO.getOperatingSystemPageSize();
+
+    /**
+     * Round up a number to the operating system page size.
+     */
+    public long round(long count) {
+      long newCount = 
+          (count + (osPageSize - 1)) / osPageSize;
+      return newCount * osPageSize;
+    }
+  }
+
+  private class UsedBytesCount {
+    private final AtomicLong usedBytes = new AtomicLong(0);
+    
+    private PageRounder rounder = new PageRounder();
+
+    /**
+     * Try to reserve more bytes.
+     *
+     * @param count    The number of bytes to add.  We will round this
+     *                 up to the page size.
+     *
+     * @return         The new number of usedBytes if we succeeded;
+     *                 -1 if we failed.
+     */
+    long reserve(long count) {
+      count = rounder.round(count);
+      while (true) {
+        long cur = usedBytes.get();
+        long next = cur + count;
+        if (next > maxBytes) {
+          return -1;
+        }
+        if (usedBytes.compareAndSet(cur, next)) {
+          return next;
+        }
+      }
+    }
+    
+    /**
+     * Release some bytes that we're using.
+     *
+     * @param count    The number of bytes to release.  We will round this
+     *                 up to the page size.
+     *
+     * @return         The new number of usedBytes.
+     */
+    long release(long count) {
+      count = rounder.round(count);
+      return usedBytes.addAndGet(-count);
+    }
+    
+    long get() {
+      return usedBytes.get();
+    }
+  }
+
   /**
-   * Total cache capacity in bytes
+   * The total cache capacity in bytes.
    */
   private final long maxBytes;
 
   public FsDatasetCache(FsDatasetImpl dataset) {
     this.dataset = dataset;
-    this.cachedBlocks = new ConcurrentHashMap<Long, MappableBlock>();
-    this.usedBytes = new AtomicLong(0);
     this.maxBytes = dataset.datanode.getDnConf().getMaxLockedMemory();
-  }
-
-  /**
-   * @return if the block is cached
-   */
-  boolean isCached(String bpid, long blockId) {
-    MappableBlock mapBlock = cachedBlocks.get(blockId);
-    if (mapBlock != null) {
-      return mapBlock.getBlockPoolId().equals(bpid);
-    }
-    return false;
+    ThreadFactory workerFactory = new ThreadFactoryBuilder()
+        .setDaemon(true)
+        .setNameFormat("FsDatasetCache-%d-" + dataset.toString())
+        .build();
+    this.usedBytesCount = new UsedBytesCount();
+    this.uncachingExecutor = new ThreadPoolExecutor(
+            0, 1,
+            60, TimeUnit.SECONDS,
+            new LinkedBlockingQueue<Runnable>(),
+            workerFactory);
+    this.uncachingExecutor.allowCoreThreadTimeOut(true);
   }
 
   /**
    * @return List of cached blocks suitable for translation into a
    * {@link BlockListAsLongs} for a cache report.
    */
-  List<Long> getCachedBlocks(String bpid) {
+  synchronized List<Long> getCachedBlocks(String bpid) {
     List<Long> blocks = new ArrayList<Long>();
-    // ConcurrentHashMap iteration doesn't see latest updates, which is okay
-    Iterator<MappableBlock> it = cachedBlocks.values().iterator();
-    while (it.hasNext()) {
-      MappableBlock mapBlock = it.next();
-      if (mapBlock.getBlockPoolId().equals(bpid)) {
-        blocks.add(mapBlock.getBlock().getBlockId());
+    for (Iterator<Entry<Key, Value>> iter =
+        mappableBlockMap.entrySet().iterator(); iter.hasNext(); ) {
+      Entry<Key, Value> entry = iter.next();
+      if (entry.getKey().bpid.equals(bpid)) {
+        if (entry.getValue().state.shouldAdvertise()) {
+          blocks.add(entry.getKey().id);
+        }
       }
     }
     return blocks;
   }
 
   /**
-   * Asynchronously attempts to cache a block. This is subject to the
-   * configured maximum locked memory limit.
-   * 
-   * @param block block to cache
-   * @param volume volume of the block
-   * @param blockIn stream of the block's data file
-   * @param metaIn stream of the block's meta file
-   */
-  void cacheBlock(String bpid, Block block, FsVolumeImpl volume,
-      FileInputStream blockIn, FileInputStream metaIn) {
-    if (isCached(bpid, block.getBlockId())) {
-      return;
-    }
-    MappableBlock mapBlock = null;
-    try {
-      mapBlock = new MappableBlock(bpid, block, volume, blockIn, metaIn);
-    } catch (IOException e) {
-      LOG.warn("Failed to cache replica " + block + ": Could not instantiate"
-          + " MappableBlock", e);
-      IOUtils.closeQuietly(blockIn);
-      IOUtils.closeQuietly(metaIn);
-      return;
-    }
-    // Check if there's sufficient cache capacity
-    boolean success = false;
-    long bytes = mapBlock.getNumBytes();
-    long used = usedBytes.get();
-    while (used+bytes < maxBytes) {
-      if (usedBytes.compareAndSet(used, used+bytes)) {
-        success = true;
-        break;
+   * Attempt to begin caching a block.
+   */
+  synchronized void cacheBlock(long blockId, String bpid,
+      String blockFileName, long length, long genstamp,
+      Executor volumeExecutor) {
+    Key key = new Key(blockId, bpid);
+    Value prevValue = mappableBlockMap.get(key);
+    if (prevValue != null) {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Block with id " + blockId + ", pool " + bpid +
+            " already exists in the FsDatasetCache with state " +
+            prevValue.state);
       }
-      used = usedBytes.get();
-    }
-    if (!success) {
-      LOG.warn(String.format(
-          "Failed to cache replica %s: %s exceeded (%d + %d > %d)",
-          mapBlock.getBlock().toString(),
-          DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY,
-          used, bytes, maxBytes));
-      mapBlock.close();
       return;
     }
-    // Submit it to the worker pool to be cached
-    volume.getExecutor().execute(new WorkerTask(mapBlock));
+    mappableBlockMap.put(key, new Value(null, State.CACHING));
+    volumeExecutor.execute(
+        new CachingTask(key, blockFileName, length, genstamp));
   }
 
-  /**
-   * Uncaches a block if it is cached.
-   * @param blockId id to uncache
-   */
-  void uncacheBlock(String bpid, long blockId) {
-    MappableBlock mapBlock = cachedBlocks.get(blockId);
-    if (mapBlock != null &&
-        mapBlock.getBlockPoolId().equals(bpid) &&
-        mapBlock.getBlock().getBlockId() == blockId) {
-      mapBlock.close();
-      cachedBlocks.remove(blockId);
-      long bytes = mapBlock.getNumBytes();
-      long used = usedBytes.get();
-      while (!usedBytes.compareAndSet(used, used - bytes)) {
-        used = usedBytes.get();
-      }
-      LOG.info("Successfully uncached block " + blockId);
-    } else {
-      LOG.info("Could not uncache block " + blockId + ": unknown block.");
+  synchronized void uncacheBlock(String bpid, long blockId) {
+    Key key = new Key(blockId, bpid);
+    Value prevValue = mappableBlockMap.get(key);
+
+    if (prevValue == null) {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Block with id " + blockId + ", pool " + bpid + " " +
+            "does not need to be uncached, because it is not currently " +
+            "in the mappableBlockMap.");
+      }
+      return;
+    }
+    switch (prevValue.state) {
+    case CACHING:
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Cancelling caching for block with id " + blockId +
+            ", pool " + bpid + ".");
+      }
+      mappableBlockMap.put(key,
+          new Value(prevValue.mappableBlock, State.CACHING_CANCELLED));
+      break;
+    case CACHED:
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Block with id " + blockId + ", pool " + bpid + " " +
+            "has been scheduled for uncaching.");
+      }
+      mappableBlockMap.put(key,
+          new Value(prevValue.mappableBlock, State.UNCACHING));
+      uncachingExecutor.execute(new UncachingTask(key));
+      break;
+    default:
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Block with id " + blockId + ", pool " + bpid + " " +
+            "does not need to be uncached, because it is " +
+            "in state " + prevValue.state + ".");
+      }
+      break;
     }
   }
 
   /**
    * Background worker that mmaps, mlocks, and checksums a block
    */
-  private class WorkerTask implements Runnable {
-
-    private MappableBlock block;
-    WorkerTask(MappableBlock block) {
-      this.block = block;
+  private class CachingTask implements Runnable {
+    private final Key key; 
+    private final String blockFileName;
+    private final long length;
+    private final long genstamp;
+
+    CachingTask(Key key, String blockFileName, long length, long genstamp) {
+      this.key = key;
+      this.blockFileName = blockFileName;
+      this.length = length;
+      this.genstamp = genstamp;
     }
 
     @Override
     public void run() {
       boolean success = false;
+      FileInputStream blockIn = null, metaIn = null;
+      MappableBlock mappableBlock = null;
+      ExtendedBlock extBlk =
+          new ExtendedBlock(key.bpid, key.id, length, genstamp);
+      long newUsedBytes = usedBytesCount.reserve(length);
+      if (newUsedBytes < 0) {
+        LOG.warn("Failed to cache block id " + key.id + ", pool " + key.bpid +
+            ": could not reserve " + length + " more bytes in the " +
+            "cache: " + DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY +
+            " of " + maxBytes + " exceeded."); 
+        return;
+      }
       try {
-        block.map();
-        block.lock();
-        block.verifyChecksum();
+        try {
+          blockIn = (FileInputStream)dataset.getBlockInputStream(extBlk, 0);
+          metaIn = (FileInputStream)dataset.getMetaDataInputStream(extBlk)
+              .getWrappedStream();
+        } catch (ClassCastException e) {
+          LOG.warn("Failed to cache block with id " + key.id + ", pool " +
+                key.bpid + ": Underlying blocks are not backed by files.", e);
+          return;
+        } catch (FileNotFoundException e) {
+          LOG.info("Failed to cache block with id " + key.id + ", pool " +
+                key.bpid + ": failed to find backing files.");
+          return;
+        } catch (IOException e) {
+          LOG.warn("Failed to cache block with id " + key.id + ", pool " +
+                key.bpid + ": failed to open file", e);
+          return;
+        }
+        try {
+          mappableBlock = MappableBlock.
+              load(length, blockIn, metaIn, blockFileName);
+        } catch (ChecksumException e) {
+          // Exception message is bogus since this wasn't caused by a file read
+          LOG.warn("Failed to cache block " + key.id + " in " + key.bpid + ": " +
+                   "checksum verification failed.");
+          return;
+        } catch (IOException e) {
+          LOG.warn("Failed to cache block " + key.id + " in " + key.bpid, e);
+          return;
+        }
+        synchronized (FsDatasetCache.this) {
+          Value value = mappableBlockMap.get(key);
+          Preconditions.checkNotNull(value);
+          Preconditions.checkState(value.state == State.CACHING ||
+                                   value.state == State.CACHING_CANCELLED);
+          if (value.state == State.CACHING_CANCELLED) {
+            mappableBlockMap.remove(key);
+            LOG.warn("Caching of block " + key.id + " in " + key.bpid +
+                " was cancelled.");
+            return;
+          }
+          mappableBlockMap.put(key, new Value(mappableBlock, State.CACHED));
+        }
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Successfully cached block " + key.id + " in " + key.bpid +
+              ".  We are now caching " + newUsedBytes + " bytes in total.");
+        }
         success = true;
-      } catch (ChecksumException e) {
-        // Exception message is bogus since this wasn't caused by a file read
-        LOG.warn("Failed to cache block " + block.getBlock() + ": Checksum "
-            + "verification failed.");
-      } catch (IOException e) {
-        LOG.warn("Failed to cache block " + block.getBlock() + ": IOException",
-            e);
-      }
-      // If we failed or the block became uncacheable in the meantime,
-      // clean up and return the reserved cache allocation 
-      if (!success || 
-          !dataset.validToCache(block.getBlockPoolId(),
-              block.getBlock().getBlockId())) {
-        block.close();
-        long used = usedBytes.get();
-        while (!usedBytes.compareAndSet(used, used-block.getNumBytes())) {
-          used = usedBytes.get();
-        }
-      } else {
-        LOG.info("Successfully cached block " + block.getBlock());
-        cachedBlocks.put(block.getBlock().getBlockId(), block);
+      } finally {
+        if (!success) {
+          newUsedBytes = usedBytesCount.release(length);
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("Caching of block " + key.id + " in " +
+              key.bpid + " was aborted.  We are now caching only " +
+              newUsedBytes + " + bytes in total.");
+          }
+          IOUtils.closeQuietly(blockIn);
+          IOUtils.closeQuietly(metaIn);
+          if (mappableBlock != null) {
+            mappableBlock.close();
+          }
+        }
+      }
+    }
+  }
+
+  private class UncachingTask implements Runnable {
+    private final Key key; 
+
+    UncachingTask(Key key) {
+      this.key = key;
+    }
+
+    @Override
+    public void run() {
+      Value value;
+      
+      synchronized (FsDatasetCache.this) {
+        value = mappableBlockMap.get(key);
+      }
+      Preconditions.checkNotNull(value);
+      Preconditions.checkArgument(value.state == State.UNCACHING);
+      // TODO: we will eventually need to do revocation here if any clients
+      // are reading via mmap with checksums enabled.  See HDFS-5182.
+      IOUtils.closeQuietly(value.mappableBlock);
+      synchronized (FsDatasetCache.this) {
+        mappableBlockMap.remove(key);
+      }
+      long newUsedBytes =
+          usedBytesCount.release(value.mappableBlock.getLength());
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Uncaching of block " + key.id + " in " + key.bpid +
+            " completed.  usedBytes = " + newUsedBytes);
       }
     }
   }
 
   // Stats related methods for FsDatasetMBean
 
+  /**
+   * Get the approximate amount of cache space used.
+   */
   public long getDnCacheUsed() {
-    return usedBytes.get();
+    return usedBytesCount.get();
   }
 
+  /**
+   * Get the maximum amount of bytes we can cache.  This is a constant.
+   */
   public long getDnCacheCapacity() {
     return maxBytes;
   }

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java?rev=1539909&r1=1539908&r2=1539909&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java Fri Nov  8 03:00:19 2013
@@ -32,12 +32,12 @@ import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.concurrent.Executor;
 
 import javax.management.NotCompliantMBeanException;
 import javax.management.ObjectName;
 import javax.management.StandardMBean;
 
-import org.apache.commons.io.IOUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
@@ -553,7 +553,7 @@ class FsDatasetImpl implements FsDataset
   private synchronized ReplicaBeingWritten append(String bpid,
       FinalizedReplica replicaInfo, long newGS, long estimateBlockLen)
       throws IOException {
-    // uncache the block
+    // If the block is cached, start uncaching it.
     cacheManager.uncacheBlock(bpid, replicaInfo.getBlockId());
     // unlink the finalized replica
     replicaInfo.unlinkBlock(1);
@@ -1168,10 +1168,11 @@ class FsDatasetImpl implements FsDataset
         }
         volumeMap.remove(bpid, invalidBlks[i]);
       }
-
-      // Uncache the block synchronously
+      // If the block is cached, start uncaching it.
       cacheManager.uncacheBlock(bpid, invalidBlks[i].getBlockId());
-      // Delete the block asynchronously to make sure we can do it fast enough
+      // Delete the block asynchronously to make sure we can do it fast enough.
+      // It's ok to unlink the block file before the uncache operation
+      // finishes.
       asyncDiskService.deleteAsync(v, f,
           FsDatasetUtil.getMetaFile(f, invalidBlks[i].getGenerationStamp()),
           new ExtendedBlock(bpid, invalidBlks[i]));
@@ -1181,66 +1182,47 @@ class FsDatasetImpl implements FsDataset
     }
   }
 
-  synchronized boolean validToCache(String bpid, long blockId) {
-    ReplicaInfo info = volumeMap.get(bpid, blockId);
-    if (info == null) {
-      LOG.warn("Failed to cache replica in block pool " + bpid +
-          " with block id " + blockId + ": ReplicaInfo not found.");
-      return false;
-    }
-    FsVolumeImpl volume = (FsVolumeImpl)info.getVolume();
-    if (volume == null) {
-      LOG.warn("Failed to cache block with id " + blockId +
-          ": Volume not found.");
-      return false;
-    }
-    if (info.getState() != ReplicaState.FINALIZED) {
-      LOG.warn("Failed to block with id " + blockId + 
-          ": Replica is not finalized.");
-      return false;
-    }
-    return true;
-  }
-
   /**
    * Asynchronously attempts to cache a single block via {@link FsDatasetCache}.
    */
   private void cacheBlock(String bpid, long blockId) {
-    ReplicaInfo info;
     FsVolumeImpl volume;
+    String blockFileName;
+    long length, genstamp;
+    Executor volumeExecutor;
+
     synchronized (this) {
-      if (!validToCache(bpid, blockId)) {
+      ReplicaInfo info = volumeMap.get(bpid, blockId);
+      if (info == null) {
+        LOG.warn("Failed to cache block with id " + blockId + ", pool " +
+            bpid + ": ReplicaInfo not found.");
         return;
       }
-      info = volumeMap.get(bpid, blockId);
-      volume = (FsVolumeImpl)info.getVolume();
-    }
-    // Try to open block and meta streams
-    FileInputStream blockIn = null;
-    FileInputStream metaIn = null;
-    boolean success = false;
-    ExtendedBlock extBlk =
-        new ExtendedBlock(bpid, blockId,
-            info.getBytesOnDisk(), info.getGenerationStamp());
-    try {
-      blockIn = (FileInputStream)getBlockInputStream(extBlk, 0);
-      metaIn = (FileInputStream)getMetaDataInputStream(extBlk)
-          .getWrappedStream();
-      success = true;
-    } catch (ClassCastException e) {
-      LOG.warn("Failed to cache replica " + extBlk + ": Underlying blocks"
-          + " are not backed by files.", e);
-    } catch (IOException e) {
-      LOG.warn("Failed to cache replica " + extBlk + ": IOException while"
-          + " trying to open block or meta files.", e);
-    }
-    if (!success) {
-      IOUtils.closeQuietly(blockIn);
-      IOUtils.closeQuietly(metaIn);
-      return;
+      if (info.getState() != ReplicaState.FINALIZED) {
+        LOG.warn("Failed to cache block with id " + blockId + ", pool " +
+            bpid + ": replica is not finalized; it is in state " +
+            info.getState());
+        return;
+      }
+      try {
+        volume = (FsVolumeImpl)info.getVolume();
+        if (volume == null) {
+          LOG.warn("Failed to cache block with id " + blockId + ", pool " +
+              bpid + ": volume not found.");
+          return;
+        }
+      } catch (ClassCastException e) {
+        LOG.warn("Failed to cache block with id " + blockId +
+            ": volume was not an instance of FsVolumeImpl.");
+        return;
+      }
+      blockFileName = info.getBlockFile().getAbsolutePath();
+      length = info.getVisibleLength();
+      genstamp = info.getGenerationStamp();
+      volumeExecutor = volume.getCacheExecutor();
     }
-    cacheManager.cacheBlock(bpid, extBlk.getLocalBlock(),
-        volume, blockIn, metaIn);
+    cacheManager.cacheBlock(blockId, bpid, 
+        blockFileName, length, genstamp, volumeExecutor);
   }
 
   @Override // FsDatasetSpi

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeImpl.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeImpl.java?rev=1539909&r1=1539908&r2=1539909&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeImpl.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeImpl.java Fri Nov  8 03:00:19 2013
@@ -18,7 +18,6 @@
 package org.apache.hadoop.hdfs.server.datanode.fsdataset.impl;
 
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
@@ -196,7 +195,7 @@ class FsVolumeImpl implements FsVolumeSp
     return getBlockPoolSlice(bpid).addBlock(b, f);
   }
 
-  Executor getExecutor() {
+  Executor getCacheExecutor() {
     return cacheExecutor;
   }
 

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/MappableBlock.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/MappableBlock.java?rev=1539909&r1=1539908&r2=1539909&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/MappableBlock.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/MappableBlock.java Fri Nov  8 03:00:19 2013
@@ -28,184 +28,104 @@ import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.channels.FileChannel.MapMode;
 
-import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.fs.ChecksumException;
-import org.apache.hadoop.hdfs.protocol.Block;
 import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
 import org.apache.hadoop.io.nativeio.NativeIO;
 import org.apache.hadoop.util.DataChecksum;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
 
 /**
- * Low-level wrapper for a Block and its backing files that provides mmap,
- * mlock, and checksum verification operations.
- * 
- * This could be a private class of FsDatasetCache, not meant for other users.
+ * Represents an HDFS block that is mmapped by the DataNode.
  */
 @InterfaceAudience.Private
 @InterfaceStability.Unstable
-class MappableBlock implements Closeable {
-
-  private final String bpid;
-  private final Block block;
-  private final FsVolumeImpl volume;
-
-  private final FileInputStream blockIn;
-  private final FileInputStream metaIn;
-  private final FileChannel blockChannel;
-  private final FileChannel metaChannel;
-  private final long blockSize;
-
-  private boolean isMapped;
-  private boolean isLocked;
-  private boolean isChecksummed;
-
-  private MappedByteBuffer blockMapped = null;
-
-  public MappableBlock(String bpid, Block blk, FsVolumeImpl volume,
-      FileInputStream blockIn, FileInputStream metaIn) throws IOException {
-    this.bpid = bpid;
-    this.block = blk;
-    this.volume = volume;
-
-    this.blockIn = blockIn;
-    this.metaIn = metaIn;
-    this.blockChannel = blockIn.getChannel();
-    this.metaChannel = metaIn.getChannel();
-    this.blockSize = blockChannel.size();
-
-    this.isMapped = false;
-    this.isLocked = false;
-    this.isChecksummed = false;
-  }
-
-  public String getBlockPoolId() {
-    return bpid;
-  }
-
-  public Block getBlock() {
-    return block;
-  }
-
-  public FsVolumeImpl getVolume() {
-    return volume;
-  }
-
-  public boolean isMapped() {
-    return isMapped;
-  }
-
-  public boolean isLocked() {
-    return isLocked;
-  }
-
-  public boolean isChecksummed() {
-    return isChecksummed;
-  }
-
-  /**
-   * Returns the number of bytes on disk for the block file
-   */
-  public long getNumBytes() {
-    return blockSize;
-  }
-
-  /**
-   * Maps the block into memory. See mmap(2).
-   */
-  public void map() throws IOException {
-    if (isMapped) {
-      return;
+public class MappableBlock implements Closeable {
+  public static interface Mlocker {
+    void mlock(MappedByteBuffer mmap, long length) throws IOException;
+  }
+  
+  private static class PosixMlocker implements Mlocker {
+    public void mlock(MappedByteBuffer mmap, long length)
+        throws IOException {
+      NativeIO.POSIX.mlock(mmap, length);
     }
-    blockMapped = blockChannel.map(MapMode.READ_ONLY, 0, blockSize);
-    isMapped = true;
   }
 
-  /**
-   * Unmaps the block from memory. See munmap(2).
-   */
-  public void unmap() {
-    if (!isMapped) {
-      return;
-    }
-    if (blockMapped instanceof sun.nio.ch.DirectBuffer) {
-      sun.misc.Cleaner cleaner =
-          ((sun.nio.ch.DirectBuffer)blockMapped).cleaner();
-      cleaner.clean();
-    }
-    isMapped = false;
-    isLocked = false;
-    isChecksummed = false;
+  @VisibleForTesting
+  public static Mlocker mlocker = new PosixMlocker();
+
+  private MappedByteBuffer mmap;
+  private final long length;
+
+  MappableBlock(MappedByteBuffer mmap, long length) {
+    this.mmap = mmap;
+    this.length = length;
+    assert length > 0;
+  }
+
+  public long getLength() {
+    return length;
   }
 
   /**
-   * Locks the block into memory. This prevents the block from being paged out.
-   * See mlock(2).
-   */
-  public void lock() throws IOException {
-    Preconditions.checkArgument(isMapped,
-        "Block must be mapped before it can be locked!");
-    if (isLocked) {
-      return;
-    }
-    NativeIO.POSIX.mlock(blockMapped, blockSize);
-    isLocked = true;
-  }
-
-  /**
-   * Unlocks the block from memory, allowing it to be paged out. See munlock(2).
-   */
-  public void unlock() throws IOException {
-    if (!isLocked || !isMapped) {
-      return;
-    }
-    NativeIO.POSIX.munlock(blockMapped, blockSize);
-    isLocked = false;
-    isChecksummed = false;
-  }
-
-  /**
-   * Reads bytes into a buffer until EOF or the buffer's limit is reached
-   */
-  private int fillBuffer(FileChannel channel, ByteBuffer buf)
-      throws IOException {
-    int bytesRead = channel.read(buf);
-    if (bytesRead < 0) {
-      //EOF
-      return bytesRead;
-    }
-    while (buf.remaining() > 0) {
-      int n = channel.read(buf);
-      if (n < 0) {
-        //EOF
-        return bytesRead;
+   * Load the block.
+   *
+   * mmap and mlock the block, and then verify its checksum.
+   *
+   * @param length         The current length of the block.
+   * @param blockIn        The block input stream.  Should be positioned at the
+   *                       start.  The caller must close this.
+   * @param metaIn         The meta file input stream.  Should be positioned at
+   *                       the start.  The caller must close this.
+   * @param blockFileName  The block file name, for logging purposes.
+   *
+   * @return               The Mappable block.
+   */
+  public static MappableBlock load(long length,
+      FileInputStream blockIn, FileInputStream metaIn,
+      String blockFileName) throws IOException {
+    MappableBlock mappableBlock = null;
+    MappedByteBuffer mmap = null;
+    try {
+      FileChannel blockChannel = blockIn.getChannel();
+      if (blockChannel == null) {
+        throw new IOException("Block InputStream has no FileChannel.");
+      }
+      mmap = blockChannel.map(MapMode.READ_ONLY, 0, length);
+      mlocker.mlock(mmap, length);
+      verifyChecksum(length, metaIn, blockChannel, blockFileName);
+      mappableBlock = new MappableBlock(mmap, length);
+    } finally {
+      if (mappableBlock == null) {
+        if (mmap != null) {
+          NativeIO.POSIX.munmap(mmap); // unmapping also unlocks
+        }
       }
-      bytesRead += n;
     }
-    return bytesRead;
+    return mappableBlock;
   }
 
   /**
    * Verifies the block's checksum. This is an I/O intensive operation.
    * @return if the block was successfully checksummed.
    */
-  public void verifyChecksum() throws IOException, ChecksumException {
-    Preconditions.checkArgument(isLocked && isMapped,
-        "Block must be mapped and locked before checksum verification!");
-    // skip if checksum has already been successfully verified
-    if (isChecksummed) {
-      return;
-    }
+  private static void verifyChecksum(long length,
+      FileInputStream metaIn, FileChannel blockChannel, String blockFileName)
+          throws IOException, ChecksumException {
     // Verify the checksum from the block's meta file
     // Get the DataChecksum from the meta file header
-    metaChannel.position(0);
     BlockMetadataHeader header =
         BlockMetadataHeader.readHeader(new DataInputStream(
             new BufferedInputStream(metaIn, BlockMetadataHeader
                 .getHeaderSize())));
+    FileChannel metaChannel = metaIn.getChannel();
+    if (metaChannel == null) {
+      throw new IOException("Block InputStream meta file has no FileChannel.");
+    }
     DataChecksum checksum = header.getChecksum();
     final int bytesPerChecksum = checksum.getBytesPerChecksum();
     final int checksumSize = checksum.getChecksumSize();
@@ -214,13 +134,13 @@ class MappableBlock implements Closeable
     ByteBuffer checksumBuf = ByteBuffer.allocate(numChunks*checksumSize);
     // Verify the checksum
     int bytesVerified = 0;
-    while (bytesVerified < blockChannel.size()) {
+    while (bytesVerified < length) {
       Preconditions.checkState(bytesVerified % bytesPerChecksum == 0,
           "Unexpected partial chunk before EOF");
       assert bytesVerified % bytesPerChecksum == 0;
       int bytesRead = fillBuffer(blockChannel, blockBuf);
       if (bytesRead == -1) {
-        throw new IOException("Premature EOF");
+        throw new IOException("checksum verification failed: premature EOF");
       }
       blockBuf.flip();
       // Number of read chunks, including partial chunk at end
@@ -228,22 +148,41 @@ class MappableBlock implements Closeable
       checksumBuf.limit(chunks*checksumSize);
       fillBuffer(metaChannel, checksumBuf);
       checksumBuf.flip();
-      checksum.verifyChunkedSums(blockBuf, checksumBuf, block.getBlockName(),
+      checksum.verifyChunkedSums(blockBuf, checksumBuf, blockFileName,
           bytesVerified);
       // Success
       bytesVerified += bytesRead;
       blockBuf.clear();
       checksumBuf.clear();
     }
-    isChecksummed = true;
-    // Can close the backing file since everything is safely in memory
-    blockChannel.close();
+  }
+
+  /**
+   * Reads bytes into a buffer until EOF or the buffer's limit is reached
+   */
+  private static int fillBuffer(FileChannel channel, ByteBuffer buf)
+      throws IOException {
+    int bytesRead = channel.read(buf);
+    if (bytesRead < 0) {
+      //EOF
+      return bytesRead;
+    }
+    while (buf.remaining() > 0) {
+      int n = channel.read(buf);
+      if (n < 0) {
+        //EOF
+        return bytesRead;
+      }
+      bytesRead += n;
+    }
+    return bytesRead;
   }
 
   @Override
   public void close() {
-    unmap();
-    IOUtils.closeQuietly(blockIn);
-    IOUtils.closeQuietly(metaIn);
+    if (mmap != null) {
+      NativeIO.POSIX.munmap(mmap);
+      mmap = null;
+    }
   }
 }

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestFsDatasetCache.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestFsDatasetCache.java?rev=1539909&r1=1539908&r2=1539909&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestFsDatasetCache.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestFsDatasetCache.java Fri Nov  8 03:00:19 2013
@@ -26,8 +26,11 @@ import static org.mockito.Mockito.doRetu
 
 import java.io.FileInputStream;
 import java.io.IOException;
+import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.HdfsBlockLocation;
@@ -42,6 +45,8 @@ import org.apache.hadoop.hdfs.protocol.B
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
 import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB;
 import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
+import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetCache.PageRounder;
+import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.MappableBlock;
 import org.apache.hadoop.hdfs.server.namenode.FSImage;
 import org.apache.hadoop.hdfs.server.namenode.NameNode;
 import org.apache.hadoop.hdfs.server.protocol.BlockIdCommand;
@@ -52,12 +57,18 @@ import org.apache.hadoop.hdfs.server.pro
 import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat;
 import org.apache.hadoop.hdfs.server.protocol.StorageReport;
 import org.apache.hadoop.io.nativeio.NativeIO;
+import org.apache.hadoop.test.GenericTestUtils;
 import org.apache.log4j.Logger;
 import org.junit.After;
+import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 
+import com.google.common.base.Preconditions;
+import com.google.common.base.Supplier;
+
 public class TestFsDatasetCache {
+  private static final Log LOG = LogFactory.getLog(TestFsDatasetCache.class);
 
   // Most Linux installs allow a default of 64KB locked memory
   private static final long CACHE_CAPACITY = 64 * 1024;
@@ -71,12 +82,14 @@ public class TestFsDatasetCache {
   private static DataNode dn;
   private static FsDatasetSpi<?> fsd;
   private static DatanodeProtocolClientSideTranslatorPB spyNN;
+  private static PageRounder rounder = new PageRounder();
 
   @Before
   public void setUp() throws Exception {
     assumeTrue(!Path.WINDOWS);
-    assumeTrue(NativeIO.isAvailable());
+    assumeTrue(NativeIO.getMemlockLimit() >= CACHE_CAPACITY);
     conf = new HdfsConfiguration();
+    conf.setBoolean(DFSConfigKeys.DFS_NAMENODE_CACHING_ENABLED_KEY, true);
     conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
     conf.setLong(DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY,
         CACHE_CAPACITY);
@@ -169,19 +182,34 @@ public class TestFsDatasetCache {
    * Blocks until cache usage hits the expected new value.
    */
   private long verifyExpectedCacheUsage(final long expected) throws Exception {
-    long cacheUsed = fsd.getDnCacheUsed();
-    while (cacheUsed != expected) {
-      cacheUsed = fsd.getDnCacheUsed();
-      Thread.sleep(100);
-    }
-    assertEquals("Unexpected amount of cache used", expected, cacheUsed);
-    return cacheUsed;
+    GenericTestUtils.waitFor(new Supplier<Boolean>() {
+      private int tries = 0;
+      
+      @Override
+      public Boolean get() {
+        long curDnCacheUsed = fsd.getDnCacheUsed();
+        if (curDnCacheUsed != expected) {
+          if (tries++ > 10) {
+            LOG.info("verifyExpectedCacheUsage: expected " +
+                expected + ", got " + curDnCacheUsed + "; " +
+                "memlock limit = " + NativeIO.getMemlockLimit() +
+                ".  Waiting...");
+          }
+          return false;
+        }
+        return true;
+      }
+    }, 100, 60000);
+    return expected;
   }
 
-  @Test(timeout=60000)
+  @Test(timeout=600000)
   public void testCacheAndUncacheBlock() throws Exception {
+    LOG.info("beginning testCacheAndUncacheBlock");
     final int NUM_BLOCKS = 5;
 
+    verifyExpectedCacheUsage(0);
+
     // Write a test file
     final Path testFile = new Path("/testCacheBlock");
     final long testFileLen = BLOCK_SIZE*NUM_BLOCKS;
@@ -211,15 +239,23 @@ public class TestFsDatasetCache {
       setHeartbeatResponse(uncacheBlock(locs[i]));
       current = verifyExpectedCacheUsage(current - blockSizes[i]);
     }
+    LOG.info("finishing testCacheAndUncacheBlock");
   }
 
-  @Test(timeout=60000)
+  @Test(timeout=600000)
   public void testFilesExceedMaxLockedMemory() throws Exception {
+    LOG.info("beginning testFilesExceedMaxLockedMemory");
+
+    // We don't want to deal with page rounding issues, so skip this
+    // test unless the OS page size is exactly 4096 bytes.
+    long osPageSize = NativeIO.getOperatingSystemPageSize();
+    assumeTrue(osPageSize == 4096); 
+
     // Create some test files that will exceed total cache capacity
-    // Don't forget that meta files take up space too!
-    final int numFiles = 4;
-    final long fileSize = CACHE_CAPACITY / numFiles;
-    final Path[] testFiles = new Path[4];
+    final int numFiles = 5;
+    final long fileSize = 15000;
+
+    final Path[] testFiles = new Path[numFiles];
     final HdfsBlockLocation[][] fileLocs = new HdfsBlockLocation[numFiles][];
     final long[] fileSizes = new long[numFiles];
     for (int i=0; i<numFiles; i++) {
@@ -235,35 +271,87 @@ public class TestFsDatasetCache {
     }
 
     // Cache the first n-1 files
-    long current = 0;
+    long total = 0;
+    verifyExpectedCacheUsage(0);
     for (int i=0; i<numFiles-1; i++) {
       setHeartbeatResponse(cacheBlocks(fileLocs[i]));
-      current = verifyExpectedCacheUsage(current + fileSizes[i]);
+      total = verifyExpectedCacheUsage(rounder.round(total + fileSizes[i]));
     }
-    final long oldCurrent = current;
 
     // nth file should hit a capacity exception
     final LogVerificationAppender appender = new LogVerificationAppender();
     final Logger logger = Logger.getRootLogger();
     logger.addAppender(appender);
     setHeartbeatResponse(cacheBlocks(fileLocs[numFiles-1]));
-    int lines = 0;
-    while (lines == 0) {
-      Thread.sleep(100);
-      lines = appender.countLinesWithMessage(
-          DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY + " exceeded");
-    }
 
-    // Uncache the cached part of the nth file
-    setHeartbeatResponse(uncacheBlocks(fileLocs[numFiles-1]));
-    while (fsd.getDnCacheUsed() != oldCurrent) {
-      Thread.sleep(100);
-    }
+    GenericTestUtils.waitFor(new Supplier<Boolean>() {
+      @Override
+      public Boolean get() {
+        int lines = appender.countLinesWithMessage(
+            "more bytes in the cache: " +
+            DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY);
+        return lines > 0;
+      }
+    }, 500, 30000);
 
     // Uncache the n-1 files
     for (int i=0; i<numFiles-1; i++) {
       setHeartbeatResponse(uncacheBlocks(fileLocs[i]));
-      current = verifyExpectedCacheUsage(current - fileSizes[i]);
+      total -= rounder.round(fileSizes[i]);
+      verifyExpectedCacheUsage(total);
+    }
+    LOG.info("finishing testFilesExceedMaxLockedMemory");
+  }
+
+  @Test(timeout=600000)
+  public void testUncachingBlocksBeforeCachingFinishes() throws Exception {
+    LOG.info("beginning testUncachingBlocksBeforeCachingFinishes");
+    final int NUM_BLOCKS = 5;
+
+    verifyExpectedCacheUsage(0);
+
+    // Write a test file
+    final Path testFile = new Path("/testCacheBlock");
+    final long testFileLen = BLOCK_SIZE*NUM_BLOCKS;
+    DFSTestUtil.createFile(fs, testFile, testFileLen, (short)1, 0xABBAl);
+
+    // Get the details of the written file
+    HdfsBlockLocation[] locs =
+        (HdfsBlockLocation[])fs.getFileBlockLocations(testFile, 0, testFileLen);
+    assertEquals("Unexpected number of blocks", NUM_BLOCKS, locs.length);
+    final long[] blockSizes = getBlockSizes(locs);
+
+    // Check initial state
+    final long cacheCapacity = fsd.getDnCacheCapacity();
+    long cacheUsed = fsd.getDnCacheUsed();
+    long current = 0;
+    assertEquals("Unexpected cache capacity", CACHE_CAPACITY, cacheCapacity);
+    assertEquals("Unexpected amount of cache used", current, cacheUsed);
+
+    MappableBlock.mlocker = new MappableBlock.Mlocker() {
+      @Override
+      public void mlock(MappedByteBuffer mmap, long length) throws IOException {
+        LOG.info("An mlock operation is starting.");
+        try {
+          Thread.sleep(3000);
+        } catch (InterruptedException e) {
+          Assert.fail();
+        }
+      }
+    };
+    // Start caching each block in succession.  The usedBytes amount
+    // should increase, even though caching doesn't complete on any of them.
+    for (int i=0; i<NUM_BLOCKS; i++) {
+      setHeartbeatResponse(cacheBlock(locs[i]));
+      current = verifyExpectedCacheUsage(current + blockSizes[i]);
     }
+    
+    setHeartbeatResponse(new DatanodeCommand[] {
+      getResponse(locs, DatanodeProtocol.DNA_UNCACHE)
+    });
+
+    // Wait until all caching jobs have finished cancelling (cache usage is 0).
+    current = verifyExpectedCacheUsage(0);
+    LOG.info("finishing testUncachingBlocksBeforeCachingFinishes");
   }
 }