You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by sz...@apache.org on 2008/10/17 20:14:05 UTC

svn commit: r705691 - in /hadoop/core/trunk: CHANGES.txt src/hdfs/org/apache/hadoop/hdfs/DFSClient.java

Author: szetszwo
Date: Fri Oct 17 11:14:05 2008
New Revision: 705691

URL: http://svn.apache.org/viewvc?rev=705691&view=rev
Log:
HADOOP-4278. If the primary datanode fails in DFSClient, remove it from the pipeline.  (dhruba via szetszwo)

Modified:
    hadoop/core/trunk/CHANGES.txt
    hadoop/core/trunk/src/hdfs/org/apache/hadoop/hdfs/DFSClient.java

Modified: hadoop/core/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/CHANGES.txt?rev=705691&r1=705690&r2=705691&view=diff
==============================================================================
--- hadoop/core/trunk/CHANGES.txt (original)
+++ hadoop/core/trunk/CHANGES.txt Fri Oct 17 11:14:05 2008
@@ -953,6 +953,9 @@
     HADOOP-4427. Adds the new queue/job commands to the manual.
     (Sreekanth Ramakrishnan via ddas)
 
+    HADOOP-4278. If the primary datanode fails in DFSClient, remove it from
+    the pipeline.  (dhruba via szetszwo)
+
 Release 0.18.2 - Unreleased
 
   BUG FIXES

Modified: hadoop/core/trunk/src/hdfs/org/apache/hadoop/hdfs/DFSClient.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/hdfs/org/apache/hadoop/hdfs/DFSClient.java?rev=705691&r1=705690&r2=705691&view=diff
==============================================================================
--- hadoop/core/trunk/src/hdfs/org/apache/hadoop/hdfs/DFSClient.java (original)
+++ hadoop/core/trunk/src/hdfs/org/apache/hadoop/hdfs/DFSClient.java Fri Oct 17 11:14:05 2008
@@ -2448,17 +2448,34 @@
         //
         LocatedBlock newBlock = null;
         ClientDatanodeProtocol primary =  null;
+        DatanodeInfo primaryNode = null;
         try {
           // Pick the "least" datanode as the primary datanode to avoid deadlock.
-          primary = createClientDatanodeProtocolProxy(
-              Collections.min(Arrays.asList(newnodes)), conf);
+          primaryNode = Collections.min(Arrays.asList(newnodes));
+          primary = createClientDatanodeProtocolProxy(primaryNode, conf);
           newBlock = primary.recoverBlock(block, newnodes);
         } catch (IOException e) {
           recoveryErrorCount++;
           if (recoveryErrorCount > maxRecoveryErrorCount) {
+            if (nodes.length > 1) {
+              // if the primary datanode failed, remove it from the list.
+              // The original bad datanode is left in the list because it is
+              // conservative to remove only one datanode in one iteration.
+              for (int j = 0; j < nodes.length; j++) {
+                if (nodes[j] ==  primaryNode) {
+                  errorIndex = j; // forget original bad node.
+                }
+              }
+              LOG.warn("Error Recovery for block " + block + " failed " +
+                       " because recovery from primary datanode " +
+                       primaryNode + " failed " + recoveryErrorCount +
+                       " times. Marking primary datanode as bad.");
+              recoveryErrorCount = 0; 
+              return true;          // sleep when we return from here
+            }
             String emsg = "Error Recovery for block " + block + " failed " +
                           " because recovery from primary datanode " +
-                          newnodes[0] + " failed " + recoveryErrorCount + 
+                          primaryNode + " failed " + recoveryErrorCount + 
                           " times. Aborting...";
             LOG.warn(emsg);
             lastException = new IOException(emsg);
@@ -2468,7 +2485,7 @@
           } 
           LOG.warn("Error Recovery for block " + block + " failed " +
                    " because recovery from primary datanode " +
-                   newnodes[0] + " failed " + recoveryErrorCount +
+                   primaryNode + " failed " + recoveryErrorCount +
                    " times. Will retry...");
           return true;          // sleep when we return from here
         } finally {