You are viewing a plain text version of this content. The canonical link for it is here.
Posted to hdfs-commits@hadoop.apache.org by to...@apache.org on 2012/02/23 02:16:33 UTC

svn commit: r1292609 - in /hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs: ./ src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ src/main/java/org/apache/hadoop/hdfs/server/namenode/ src/test/java/org/apache/hadoop/hdfs/ src/test/java/or...

Author: todd
Date: Thu Feb 23 01:16:33 2012
New Revision: 1292609

URL: http://svn.apache.org/viewvc?rev=1292609&view=rev
Log:
HDFS-2985. Improve logging when replicas are marked as corrupt. Contributed by Todd Lipcon.

Modified:
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileCorruption.java
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestCorruptReplicaInfo.java
    hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt?rev=1292609&r1=1292608&r2=1292609&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt Thu Feb 23 01:16:33 2012
@@ -241,6 +241,8 @@ Release 0.23.2 - UNRELEASED 
     HDFS-2907.  Add a conf property dfs.datanode.fsdataset.factory to make
     FSDataset in Datanode pluggable.  (szetszwo)
 
+    HDFS-2985. Improve logging when replicas are marked as corrupt. (todd)
+
   OPTIMIZATIONS
 
   BUG FIXES

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java?rev=1292609&r1=1292608&r2=1292609&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java Thu Feb 23 01:16:33 2012
@@ -804,9 +804,11 @@ public class BlockManager {
    * Mark the block belonging to datanode as corrupt
    * @param blk Block to be marked as corrupt
    * @param dn Datanode which holds the corrupt replica
+   * @param reason a textual reason why the block should be marked corrupt,
+   * for logging purposes
    */
   public void findAndMarkBlockAsCorrupt(final ExtendedBlock blk,
-      final DatanodeInfo dn) throws IOException {
+      final DatanodeInfo dn, String reason) throws IOException {
     namesystem.writeLock();
     try {
       final BlockInfo storedBlock = getStoredBlock(blk.getLocalBlock());
@@ -819,14 +821,15 @@ public class BlockManager {
             + blk + " not found.");
         return;
       }
-      markBlockAsCorrupt(storedBlock, dn);
+      markBlockAsCorrupt(storedBlock, dn, reason);
     } finally {
       namesystem.writeUnlock();
     }
   }
 
   private void markBlockAsCorrupt(BlockInfo storedBlock,
-                                  DatanodeInfo dn) throws IOException {
+                                  DatanodeInfo dn,
+                                  String reason) throws IOException {
     assert storedBlock != null : "storedBlock should not be null";
     DatanodeDescriptor node = getDatanodeManager().getDatanode(dn);
     if (node == null) {
@@ -850,7 +853,7 @@ public class BlockManager {
     node.addBlock(storedBlock);
 
     // Add this replica to corruptReplicas Map
-    corruptReplicas.addToCorruptReplicasMap(storedBlock, node);
+    corruptReplicas.addToCorruptReplicasMap(storedBlock, node, reason);
     if (countNodes(storedBlock).liveReplicas() >= inode.getReplication()) {
       // the block is over-replicated so invalidate the replicas immediately
       invalidateBlock(storedBlock, node);
@@ -1277,6 +1280,21 @@ public class BlockManager {
       this.reportedState = reportedState;
     }
   }
+  
+  /**
+   * BlockToMarkCorrupt is used to build the "toCorrupt" list, which is a
+   * list of blocks that should be considered corrupt due to a block report.
+   */
+  private static class BlockToMarkCorrupt {
+    final BlockInfo blockInfo;
+    final String reason;
+    
+    BlockToMarkCorrupt(BlockInfo blockInfo, String reason) {
+      super();
+      this.blockInfo = blockInfo;
+      this.reason = reason;
+    }
+  }
 
   /**
    * The given datanode is reporting all its blocks.
@@ -1331,7 +1349,7 @@ public class BlockManager {
     Collection<BlockInfo> toAdd = new LinkedList<BlockInfo>();
     Collection<Block> toRemove = new LinkedList<Block>();
     Collection<Block> toInvalidate = new LinkedList<Block>();
-    Collection<BlockInfo> toCorrupt = new LinkedList<BlockInfo>();
+    Collection<BlockToMarkCorrupt> toCorrupt = new LinkedList<BlockToMarkCorrupt>();
     Collection<StatefulBlockInfo> toUC = new LinkedList<StatefulBlockInfo>();
     reportDiff(node, report, toAdd, toRemove, toInvalidate, toCorrupt, toUC);
 
@@ -1351,8 +1369,8 @@ public class BlockManager {
           + " does not belong to any file.");
       addToInvalidates(b, node);
     }
-    for (BlockInfo b : toCorrupt) {
-      markBlockAsCorrupt(b, node);
+    for (BlockToMarkCorrupt b : toCorrupt) {
+      markBlockAsCorrupt(b.blockInfo, node, b.reason);
     }
   }
 
@@ -1383,8 +1401,10 @@ public class BlockManager {
       
       // If block is corrupt, mark it and continue to next block.
       BlockUCState ucState = storedBlock.getBlockUCState();
-      if (isReplicaCorrupt(iblk, reportedState, storedBlock, ucState, node)) {
-        markBlockAsCorrupt(storedBlock, node);
+      BlockToMarkCorrupt c = checkReplicaCorrupt(
+          iblk, reportedState, storedBlock, ucState, node);
+      if (c != null) {
+        markBlockAsCorrupt(c.blockInfo, node, c.reason);
         continue;
       }
       
@@ -1406,7 +1426,7 @@ public class BlockManager {
       Collection<BlockInfo> toAdd,              // add to DatanodeDescriptor
       Collection<Block> toRemove,           // remove from DatanodeDescriptor
       Collection<Block> toInvalidate,       // should be removed from DN
-      Collection<BlockInfo> toCorrupt,      // add to corrupt replicas list
+      Collection<BlockToMarkCorrupt> toCorrupt, // add to corrupt replicas list
       Collection<StatefulBlockInfo> toUC) { // add to under-construction list
     // place a delimiter in the list which separates blocks 
     // that have been reported from those that have not
@@ -1473,7 +1493,7 @@ public class BlockManager {
       final Block block, final ReplicaState reportedState, 
       final Collection<BlockInfo> toAdd, 
       final Collection<Block> toInvalidate, 
-      final Collection<BlockInfo> toCorrupt,
+      final Collection<BlockToMarkCorrupt> toCorrupt,
       final Collection<StatefulBlockInfo> toUC) {
     
     if(LOG.isDebugEnabled()) {
@@ -1504,8 +1524,10 @@ public class BlockManager {
       return storedBlock;
     }
 
-    if (isReplicaCorrupt(block, reportedState, storedBlock, ucState, dn)) {
-      toCorrupt.add(storedBlock);
+    BlockToMarkCorrupt c = checkReplicaCorrupt(
+        block, reportedState, storedBlock, ucState, dn);
+    if (c != null) {
+      toCorrupt.add(c);
       return storedBlock;
     }
 
@@ -1529,8 +1551,11 @@ public class BlockManager {
    * as switch statements, on the theory that it is easier to understand
    * the combinatorics of reportedState and ucState that way.  It should be
    * at least as efficient as boolean expressions.
+   * 
+   * @return a BlockToMarkCorrupt object, or null if the replica is not corrupt
    */
-  private boolean isReplicaCorrupt(Block iblk, ReplicaState reportedState, 
+  private BlockToMarkCorrupt checkReplicaCorrupt(
+      Block iblk, ReplicaState reportedState, 
       BlockInfo storedBlock, BlockUCState ucState, 
       DatanodeDescriptor dn) {
     switch(reportedState) {
@@ -1538,17 +1563,31 @@ public class BlockManager {
       switch(ucState) {
       case COMPLETE:
       case COMMITTED:
-        return (storedBlock.getGenerationStamp() != iblk.getGenerationStamp()
-            || storedBlock.getNumBytes() != iblk.getNumBytes());
+        if (storedBlock.getGenerationStamp() != iblk.getGenerationStamp()) {
+          return new BlockToMarkCorrupt(storedBlock,
+              "block is " + ucState + " and reported genstamp " +
+              iblk.getGenerationStamp() + " does not match " +
+              "genstamp in block map " + storedBlock.getGenerationStamp());
+        } else if (storedBlock.getNumBytes() != iblk.getNumBytes()) {
+          return new BlockToMarkCorrupt(storedBlock,
+              "block is " + ucState + " and reported length " +
+              iblk.getNumBytes() + " does not match " +
+              "length in block map " + storedBlock.getNumBytes());
+        } else {
+          return null; // not corrupt
+        }
       default:
-        return false;
+        return null;
       }
     case RBW:
     case RWR:
       if (!storedBlock.isComplete()) {
-        return false;
+        return null; // not corrupt
       } else if (storedBlock.getGenerationStamp() != iblk.getGenerationStamp()) {
-        return true;
+        return new BlockToMarkCorrupt(storedBlock,
+            "reported " + reportedState + " replica with genstamp " +
+            iblk.getGenerationStamp() + " does not match COMPLETE block's " +
+            "genstamp in block map " + storedBlock.getGenerationStamp());
       } else { // COMPLETE block, same genstamp
         if (reportedState == ReplicaState.RBW) {
           // If it's a RBW report for a COMPLETE block, it may just be that
@@ -1558,18 +1597,22 @@ public class BlockManager {
           LOG.info("Received an RBW replica for block " + storedBlock +
               " on " + dn.getName() + ": ignoring it, since the block is " +
               "complete with the same generation stamp.");
-          return false;
+          return null;
         } else {
-          return true;
+          return new BlockToMarkCorrupt(storedBlock,
+              "reported replica has invalid state " + reportedState);
         }
       }
     case RUR:       // should not be reported
     case TEMPORARY: // should not be reported
     default:
-      LOG.warn("Unexpected replica state " + reportedState
-          + " for block: " + storedBlock + 
-          " on " + dn.getName() + " size " + storedBlock.getNumBytes());
-      return true;
+      String msg = "Unexpected replica state " + reportedState
+      + " for block: " + storedBlock + 
+      " on " + dn.getName() + " size " + storedBlock.getNumBytes();
+      // log here at WARN level since this is really a broken HDFS
+      // invariant
+      LOG.warn(msg);
+      return new BlockToMarkCorrupt(storedBlock, msg);
     }
   }
 
@@ -2100,7 +2143,7 @@ public class BlockManager {
     // blockReceived reports a finalized block
     Collection<BlockInfo> toAdd = new LinkedList<BlockInfo>();
     Collection<Block> toInvalidate = new LinkedList<Block>();
-    Collection<BlockInfo> toCorrupt = new LinkedList<BlockInfo>();
+    Collection<BlockToMarkCorrupt> toCorrupt = new LinkedList<BlockToMarkCorrupt>();
     Collection<StatefulBlockInfo> toUC = new LinkedList<StatefulBlockInfo>();
     processReportedBlock(node, block, ReplicaState.FINALIZED,
                               toAdd, toInvalidate, toCorrupt, toUC);
@@ -2121,8 +2164,8 @@ public class BlockManager {
           + " does not belong to any file.");
       addToInvalidates(b, node);
     }
-    for (BlockInfo b : toCorrupt) {
-      markBlockAsCorrupt(b, node);
+    for (BlockToMarkCorrupt b : toCorrupt) {
+      markBlockAsCorrupt(b.blockInfo, node, b.reason);
     }
   }
 

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java?rev=1292609&r1=1292608&r2=1292609&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java Thu Feb 23 01:16:33 2012
@@ -44,25 +44,37 @@ public class CorruptReplicasMap{
    *
    * @param blk Block to be added to CorruptReplicasMap
    * @param dn DatanodeDescriptor which holds the corrupt replica
+   * @param reason a textual reason (for logging purposes)
    */
-  public void addToCorruptReplicasMap(Block blk, DatanodeDescriptor dn) {
+  public void addToCorruptReplicasMap(Block blk, DatanodeDescriptor dn,
+      String reason) {
     Collection<DatanodeDescriptor> nodes = getNodes(blk);
     if (nodes == null) {
       nodes = new TreeSet<DatanodeDescriptor>();
       corruptReplicasMap.put(blk, nodes);
     }
+    
+    String reasonText;
+    if (reason != null) {
+      reasonText = " because " + reason;
+    } else {
+      reasonText = "";
+    }
+    
     if (!nodes.contains(dn)) {
       nodes.add(dn);
       NameNode.stateChangeLog.info("BLOCK NameSystem.addToCorruptReplicasMap: "+
                                    blk.getBlockName() +
                                    " added as corrupt on " + dn.getName() +
-                                   " by " + Server.getRemoteIp());
+                                   " by " + Server.getRemoteIp() +
+                                   reasonText);
     } else {
       NameNode.stateChangeLog.info("BLOCK NameSystem.addToCorruptReplicasMap: "+
                                    "duplicate requested for " + 
                                    blk.getBlockName() + " to add as corrupt " +
                                    "on " + dn.getName() +
-                                   " by " + Server.getRemoteIp());
+                                   " by " + Server.getRemoteIp() +
+                                   reasonText);
     }
   }
 

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java?rev=1292609&r1=1292608&r2=1292609&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java Thu Feb 23 01:16:33 2012
@@ -549,7 +549,8 @@ class NameNodeRpcServer implements Namen
       DatanodeInfo[] nodes = blocks[i].getLocations();
       for (int j = 0; j < nodes.length; j++) {
         DatanodeInfo dn = nodes[j];
-        namesystem.getBlockManager().findAndMarkBlockAsCorrupt(blk, dn);
+        namesystem.getBlockManager().findAndMarkBlockAsCorrupt(blk, dn,
+            "client machine reported it");
       }
     }
   }

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileCorruption.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileCorruption.java?rev=1292609&r1=1292608&r2=1292609&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileCorruption.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileCorruption.java Thu Feb 23 01:16:33 2012
@@ -147,7 +147,7 @@ public class TestFileCorruption extends 
       DatanodeRegistration dnR = 
         DataNodeTestUtils.getDNRegistrationForBP(dataNode, blk.getBlockPoolId());
       cluster.getNamesystem().getBlockManager().findAndMarkBlockAsCorrupt(
-          blk, new DatanodeInfo(dnR));
+          blk, new DatanodeInfo(dnR), "TEST");
       
       // open the file
       fs.open(FILE_PATH);

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestCorruptReplicaInfo.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestCorruptReplicaInfo.java?rev=1292609&r1=1292608&r2=1292609&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestCorruptReplicaInfo.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestCorruptReplicaInfo.java Thu Feb 23 01:16:33 2012
@@ -83,14 +83,14 @@ public class TestCorruptReplicaInfo exte
       DatanodeDescriptor dn1 = new DatanodeDescriptor();
       DatanodeDescriptor dn2 = new DatanodeDescriptor();
       
-      crm.addToCorruptReplicasMap(getBlock(0), dn1);
+      crm.addToCorruptReplicasMap(getBlock(0), dn1, "TEST");
       assertEquals("Number of corrupt blocks not returning correctly",
                    1, crm.size());
-      crm.addToCorruptReplicasMap(getBlock(1), dn1);
+      crm.addToCorruptReplicasMap(getBlock(1), dn1, "TEST");
       assertEquals("Number of corrupt blocks not returning correctly",
                    2, crm.size());
       
-      crm.addToCorruptReplicasMap(getBlock(1), dn2);
+      crm.addToCorruptReplicasMap(getBlock(1), dn2, "TEST");
       assertEquals("Number of corrupt blocks not returning correctly",
                    2, crm.size());
       
@@ -103,7 +103,7 @@ public class TestCorruptReplicaInfo exte
                    0, crm.size());
       
       for (Long block_id: block_ids) {
-        crm.addToCorruptReplicasMap(getBlock(block_id), dn1);
+        crm.addToCorruptReplicasMap(getBlock(block_id), dn1, "TEST");
       }
             
       assertEquals("Number of corrupt blocks not returning correctly",

Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java?rev=1292609&r1=1292608&r2=1292609&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java Thu Feb 23 01:16:33 2012
@@ -172,7 +172,8 @@ public class TestNameNodeMetrics {
     // Corrupt first replica of the block
     LocatedBlock block = NameNodeAdapter.getBlockLocations(
         cluster.getNameNode(), file.toString(), 0, 1).get(0);
-    bm.findAndMarkBlockAsCorrupt(block.getBlock(), block.getLocations()[0]);
+    bm.findAndMarkBlockAsCorrupt(block.getBlock(), block.getLocations()[0],
+        "TEST");
     updateMetrics();
     MetricsRecordBuilder rb = getMetrics(NS_METRICS);
     assertGauge("CorruptBlocks", 1L, rb);
@@ -211,7 +212,8 @@ public class TestNameNodeMetrics {
     // Corrupt the only replica of the block to result in a missing block
     LocatedBlock block = NameNodeAdapter.getBlockLocations(
         cluster.getNameNode(), file.toString(), 0, 1).get(0);
-    bm.findAndMarkBlockAsCorrupt(block.getBlock(), block.getLocations()[0]);
+    bm.findAndMarkBlockAsCorrupt(block.getBlock(), block.getLocations()[0],
+        "TEST");
     updateMetrics();
     MetricsRecordBuilder rb = getMetrics(NS_METRICS);
     assertGauge("UnderReplicatedBlocks", 1L, rb);