Posted to common-commits@hadoop.apache.org by ha...@apache.org on 2008/12/09 22:58:00 UTC

svn commit: r724907 - in /hadoop/core/branches/branch-0.18: CHANGES.txt src/hdfs/org/apache/hadoop/dfs/DataNode.java src/hdfs/org/apache/hadoop/dfs/FSDataset.java src/test/org/apache/hadoop/dfs/TestDiskError.java

Author: hairong
Date: Tue Dec  9 13:57:59 2008
New Revision: 724907

URL: http://svn.apache.org/viewvc?rev=724907&view=rev
Log:
Merge -r 724882:724883 from main to move the change log of HADOOP-4702 into release 0.18.

Modified:
    hadoop/core/branches/branch-0.18/CHANGES.txt
    hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/DataNode.java
    hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/FSDataset.java
    hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestDiskError.java

Modified: hadoop/core/branches/branch-0.18/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/CHANGES.txt?rev=724907&r1=724906&r2=724907&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/CHANGES.txt (original)
+++ hadoop/core/branches/branch-0.18/CHANGES.txt Tue Dec  9 13:57:59 2008
@@ -40,7 +40,7 @@
     HADOOP-4061. Throttle Datanode decommission monitoring in Namenode.
     (szetszwo)
 
-    HADOOP-4659. Root cause of connection failure is being ost to code that
+    HADOOP-4659. Root cause of connection failure is being lost to code that
     uses it for delaying startup. (Steve Loughran and Hairong via hairong)
 
     HADOOP-4614. Lazily open segments when merging map spills to avoid using
@@ -83,6 +83,9 @@
 
     HADOOP-4742. Replica gets deleted by mistake. (Wang Xu via hairong)
 
+    HADOOP-4702. Failed block replication leaves an incomplete block in
+    receiver's tmp data directory. (hairong)
+
 Release 0.18.2 - 2008-11-03
 
   BUG FIXES

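For context, HADOOP-4702 is about a block transfer that fails partway through: before this patch, the receiving datanode left the half-written replica (and its checksum file) behind under its tmp data directory. A purely illustrative sketch of how such leftovers would show up on disk, assuming the 0.18 layout where partial replicas sit under a datanode data directory's tmp subdirectory with a blk_ prefix (this helper is hypothetical and not part of the patch):

    import java.io.File;

    // Illustrative only: list partial replicas left under a datanode tmp directory.
    // Assumes the 0.18-era <dataDir>/tmp layout and blk_ file naming.
    public class ListTmpBlocks {
      public static void main(String[] args) {
        File tmpDir = new File(args[0], "tmp");   // e.g. build/test/data/dfs/data/data1
        File[] leftovers = tmpDir.listFiles();
        if (leftovers == null) {
          System.out.println("No tmp directory at " + tmpDir);
          return;
        }
        for (File f : leftovers) {
          if (f.getName().startsWith("blk_")) {
            // Before this fix, a failed replication could strand such a file here.
            System.out.println("partial replica: " + f + " (" + f.length() + " bytes)");
          }
        }
      }
    }
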
Modified: hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/DataNode.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/DataNode.java?rev=724907&r1=724906&r2=724907&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/DataNode.java (original)
+++ hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/DataNode.java Tue Dec  9 13:57:59 2008
@@ -2364,10 +2364,10 @@
         this.isRecovery = isRecovery;
         this.clientName = clientName;
         this.offsetInBlock = 0;
+        this.srcDataNode = srcDataNode;
         this.checksum = DataChecksum.newDataChecksum(in);
         this.bytesPerChecksum = checksum.getBytesPerChecksum();
         this.checksumSize = checksum.getChecksumSize();
-        this.srcDataNode = srcDataNode;
         //
         // Open local disk out
         //
@@ -2381,6 +2381,9 @@
         }
       } catch(IOException ioe) {
         IOUtils.closeStream(this);
+        removeBlock();
+
+        // check if there is a disk error
         IOException cause = FSDataset.getCauseIfDiskError(ioe);
         if (cause != null) { // possible disk error
           ioe = cause;
@@ -2792,6 +2795,7 @@
         if (responder != null) {
           responder.interrupt();
         }
+        removeBlock();
         throw ioe;
       } finally {
         if (responder != null) {
@@ -2805,6 +2809,15 @@
       }
     }
 
+    /** Remove a partial block
+     * if this write is for a replication request (and not from a client)
+     */
+    private void removeBlock() throws IOException {
+      if (clientName.length() == 0) { // not client write
+        data.unfinalizeBlock(block);
+      }
+    }
+
     /**
      * Sets the file pointer in the local block file to the specified value.
      */

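The DataNode.java change does two things: it assigns srcDataNode before the constructor statements that can throw (so the field is set even when setup fails), and it adds removeBlock(), called from both failure paths, which asks FSDataset to unfinalize the block only when clientName is empty, i.e. when the write came from another datanode as a replication request rather than from a client. A simplified, hypothetical sketch of that cleanup-on-error pattern (names invented; the real code is in DataNode.BlockReceiver):

    import java.io.IOException;

    // Sketch of the cleanup-on-error pattern introduced in this hunk
    // (simplified, hypothetical names).
    class BlockReceiverSketch {
      private final String clientName;   // empty for datanode-to-datanode replication
      private final FSDatasetSketch data;
      private final Object block;        // stands in for org.apache.hadoop.dfs.Block

      BlockReceiverSketch(String clientName, FSDatasetSketch data, Object block) {
        this.clientName = clientName;
        this.data = data;
        this.block = block;
      }

      void receiveBlock() throws IOException {
        try {
          readPacketsAndWriteToDisk();   // placeholder for the real receive loop
        } catch (IOException ioe) {
          removeBlock();                 // discard the partial replica before rethrowing
          throw ioe;
        }
      }

      private void readPacketsAndWriteToDisk() throws IOException {
        // ... read packets from the upstream socket and append to the tmp block file ...
      }

      // Remove the partial block only for replication requests; a client write is
      // left in place so that client-side recovery can still deal with it.
      private void removeBlock() throws IOException {
        if (clientName.length() == 0) {  // empty client name => replication request
          data.unfinalizeBlock(block);
        }
      }

      /** Minimal stand-in for the FSDataset call used above. */
      interface FSDatasetSketch {
        void unfinalizeBlock(Object b) throws IOException;
      }
    }
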
Modified: hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/FSDataset.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/FSDataset.java?rev=724907&r1=724906&r2=724907&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/FSDataset.java (original)
+++ hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/FSDataset.java Tue Dec  9 13:57:59 2008
@@ -1089,12 +1089,46 @@
    * Remove the temporary block file (if any)
    */
   public synchronized void unfinalizeBlock(Block b) throws IOException {
-    ongoingCreates.remove(b);
+    // remove the block from in-memory data structure
+    ActiveFile activefile = ongoingCreates.remove(b);
+    if (activefile == null) {
+      return;
+    }
     volumeMap.remove(b);
-    DataNode.LOG.warn("Block " + b + " unfinalized and removed. " );
+    
+    // delete the on-disk temp file
+    if (delBlockFromDisk(activefile.file, getMetaFile(activefile.file, b), b)) {
+      DataNode.LOG.warn("Block " + b + " unfinalized and removed. " );
+    }
   }
 
   /**
+   * Remove a block from disk
+   * @param blockFile block file
+   * @param metaFile block meta file
+   * @param b a block
+   * @return true if on-disk files are deleted; false otherwise
+   */
+  private boolean delBlockFromDisk(File blockFile, File metaFile, Block b) {
+    if (blockFile == null) {
+      DataNode.LOG.warn("No file exists for block: " + b);
+      return true;
+    }
+    
+    if (!blockFile.delete()) {
+      DataNode.LOG.warn("Not able to delete the block file: " + blockFile);
+      return false;
+    } else { // remove the meta file
+      if (metaFile != null && !metaFile.delete()) {
+        DataNode.LOG.warn(
+            "Not able to delete the meta block file: " + metaFile);
+        return false;
+      }
+    }
+    return true;
+  }
+  
+  /**
    * Return a table of block data
    */
   public Block[] getBlockReport() {

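In FSDataset.java, unfinalizeBlock() now removes the block's ActiveFile entry from ongoingCreates and, when one existed, deletes the temporary block file and its meta (checksum) file through the new delBlockFromDisk() helper. The same delete-block-then-meta logic as a standalone sketch, with System.err standing in for the datanode log and hypothetical file names in main():

    import java.io.File;

    // Standalone sketch of the delete-block-then-meta logic added to FSDataset
    // (simplified: prints to System.err instead of using the datanode log).
    public class DeleteReplicaSketch {
      /** @return true if both files are gone (or never existed); false otherwise */
      static boolean delBlockFromDisk(File blockFile, File metaFile) {
        if (blockFile == null) {
          System.err.println("No file exists for block");
          return true;                    // nothing on disk to clean up
        }
        if (!blockFile.delete()) {
          System.err.println("Not able to delete the block file: " + blockFile);
          return false;
        }
        if (metaFile != null && !metaFile.delete()) {
          System.err.println("Not able to delete the meta block file: " + metaFile);
          return false;
        }
        return true;
      }

      public static void main(String[] args) {
        // Hypothetical example paths; in 0.18 a meta file is named blk_<id>_<genstamp>.meta.
        boolean ok = delBlockFromDisk(new File("tmp/blk_1"), new File("tmp/blk_1_1.meta"));
        System.out.println("cleaned up: " + ok);
      }
    }
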
Modified: hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestDiskError.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestDiskError.java?rev=724907&r1=724906&r2=724907&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestDiskError.java (original)
+++ hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestDiskError.java Tue Dec  9 13:57:59 2008
@@ -17,17 +17,22 @@
  */
 package org.apache.hadoop.dfs;
 
+import java.io.DataOutputStream;
 import java.io.File;
+import java.net.InetSocketAddress;
+import java.net.Socket;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.dfs.DFSTestUtil;
+import org.apache.hadoop.dfs.FSConstants;
 import org.apache.hadoop.dfs.MiniDFSCluster;
+import org.apache.hadoop.io.Text;
 
 import junit.framework.TestCase;
 
-/** Test if a datanode can handle disk error correctly*/
+/** Test if a datanode can correctly handle errors during block read/write*/
 public class TestDiskError extends TestCase {
   public void testShutdown() throws Exception {
     // bring up a cluster of 3
@@ -54,6 +59,7 @@
         Path fileName = new Path("/test.txt"+i);
         DFSTestUtil.createFile(fs, fileName, 1024, (short)2, 1L);
         DFSTestUtil.waitReplication(fs, fileName, (short)2);
+        fs.delete(fileName, true);
       }
     } finally {
       // restore its old permission
@@ -62,4 +68,78 @@
       cluster.shutdown();
     }
   }
+  
+  public void testReplicationError() throws Exception {
+    // bring up a cluster of 1
+    Configuration conf = new Configuration();
+    MiniDFSCluster cluster = new MiniDFSCluster(conf, 1, true, null);
+    cluster.waitActive();
+    FileSystem fs = cluster.getFileSystem();
+    
+    try {
+      // create a file of replication factor of 1
+      final Path fileName = new Path("/test.txt");
+      final int fileLen = 1;
+      DFSTestUtil.createFile(fs, fileName, 1, (short)1, 1L);
+      DFSTestUtil.waitReplication(fs, fileName, (short)1);
+
+      // get the block belonged to the created file
+      LocatedBlocks blocks = cluster.getNameNode().namesystem.getBlockLocations(
+          fileName.toString(), 0, (long)fileLen);
+      assertEquals(blocks.locatedBlockCount(), 1);
+      LocatedBlock block = blocks.get(0);
+      
+      // bring up a second datanode
+      cluster.startDataNodes(conf, 1, true, null, null);
+      cluster.waitActive();
+      final int sndNode = 1;
+      DataNode datanode = cluster.getDataNodes().get(sndNode);
+      
+      // replicate the block to the second datanode
+      InetSocketAddress target = datanode.getSelfAddr();
+      Socket s = new Socket(target.getAddress(), target.getPort());
+        //write the header.
+      DataOutputStream out = new DataOutputStream(
+          s.getOutputStream());
+
+      out.writeShort( FSConstants.DATA_TRANSFER_VERSION );
+      out.write( FSConstants.OP_WRITE_BLOCK );
+      out.writeLong( block.getBlock().getBlockId());
+      out.writeLong( block.getBlock().getGenerationStamp() );
+      out.writeInt(1);
+      out.writeBoolean( false );       // recovery flag
+      Text.writeString( out, "" );
+      out.writeBoolean(false); // Not sending src node information
+      out.writeInt(0);
+      
+      // write check header
+      out.writeByte( 1 );
+      out.writeInt( 512 );
+
+      out.flush();
+
+      // close the connection before sending the content of the block
+      out.close();
+      
+      // the temporary block & meta files should be deleted
+      String dataDir = new File(
+         System.getProperty("test.build.data", "build/test/data"), 
+         "dfs").toString() + "/data";
+      File dir1 = new File(new File(dataDir, "data"+(2*sndNode+1)), "tmp");
+      File dir2 = new File(new File(dataDir, "data"+(2*sndNode+2)), "tmp");
+      while (dir1.listFiles().length != 0 || dir2.listFiles().length != 0) {
+        Thread.sleep(100);
+      }
+      
+      // then increase the file's replication factor
+      fs.setReplication(fileName, (short)2);
+      // replication should succeed
+      DFSTestUtil.waitReplication(fs, fileName, (short)1);
+      
+      // clean up the file
+      fs.delete(fileName, false);
+    } finally {
+      cluster.shutdown();
+    }
+  }
 }
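
The new testReplicationError case talks to the second datanode's data-transfer port directly: it writes an OP_WRITE_BLOCK header, then closes the socket before sending any block data, so the receiver hits an IOException and, because the client name is empty, should remove the partial replica from its tmp directories; the test then polls those directories until they are empty and finally checks that re-replication still succeeds. For reference, the header fields the test writes, wrapped in a hypothetical helper (field order copied from the test above; the comments describe the 0.18 wire format as used there):

    import java.io.DataOutputStream;
    import java.io.IOException;

    import org.apache.hadoop.dfs.FSConstants;
    import org.apache.hadoop.io.Text;

    // Hypothetical helper mirroring the inline header writes in testReplicationError.
    class WriteBlockHeader {
      static void write(DataOutputStream out, long blockId, long genStamp) throws IOException {
        out.writeShort(FSConstants.DATA_TRANSFER_VERSION); // protocol version
        out.write(FSConstants.OP_WRITE_BLOCK);             // opcode
        out.writeLong(blockId);                            // block id
        out.writeLong(genStamp);                           // generation stamp
        out.writeInt(1);                                   // pipeline size
        out.writeBoolean(false);                           // recovery flag
        Text.writeString(out, "");                         // client name: empty => replication
        out.writeBoolean(false);                           // no source datanode info
        out.writeInt(0);                                   // number of downstream targets
        out.writeByte(1);                                  // checksum type (CRC32 in 0.18)
        out.writeInt(512);                                 // bytes per checksum
        out.flush();
      }
    }

A caller would invoke WriteBlockHeader.write(out, block.getBlock().getBlockId(), block.getBlock().getGenerationStamp()) in place of the inline writes in the test; closing the stream right after this header is what forces the receiver down the error path that removeBlock() now cleans up.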