Posted to common-commits@hadoop.apache.org by ki...@apache.org on 2015/09/28 21:29:36 UTC

hadoop git commit: HDFS-9106. Transfer failure during pipeline recovery causes permanent write failures. Contributed by Kihwal Lee.

Repository: hadoop
Updated Branches:
  refs/heads/trunk fb2e525c0 -> 4c9497cbf


HDFS-9106. Transfer failure during pipeline recovery causes permanent write failures. Contributed by Kihwal Lee.


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/4c9497cb
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/4c9497cb
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/4c9497cb

Branch: refs/heads/trunk
Commit: 4c9497cbf02ecc82532a4e79e18912d8e0eb4731
Parents: fb2e525
Author: Kihwal Lee <ki...@apache.org>
Authored: Mon Sep 28 13:29:19 2015 -0500
Committer: Kihwal Lee <ki...@apache.org>
Committed: Mon Sep 28 13:29:56 2015 -0500

----------------------------------------------------------------------
 .../org/apache/hadoop/hdfs/DataStreamer.java    | 60 ++++++++++++++------
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt     |  3 +
 2 files changed, 47 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/4c9497cb/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java
index 6482966..d1d8d37 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java
@@ -1208,22 +1208,46 @@ class DataStreamer extends Daemon {
       return;
     }
 
-    //get a new datanode
+    int tried = 0;
     final DatanodeInfo[] original = nodes;
-    final LocatedBlock lb = dfsClient.namenode.getAdditionalDatanode(
-        src, stat.getFileId(), block, nodes, storageIDs,
-        failed.toArray(new DatanodeInfo[failed.size()]),
-        1, dfsClient.clientName);
-    setPipeline(lb);
-
-    //find the new datanode
-    final int d = findNewDatanode(original);
-
-    //transfer replica
-    final DatanodeInfo src = d == 0? nodes[1]: nodes[d - 1];
-    final DatanodeInfo[] targets = {nodes[d]};
-    final StorageType[] targetStorageTypes = {storageTypes[d]};
-    transfer(src, targets, targetStorageTypes, lb.getBlockToken());
+    final StorageType[] originalTypes = storageTypes;
+    final String[] originalIDs = storageIDs;
+    IOException caughtException = null;
+    ArrayList<DatanodeInfo> exclude = new ArrayList<DatanodeInfo>(failed);
+    while (tried < 3) {
+      LocatedBlock lb;
+      //get a new datanode
+      lb = dfsClient.namenode.getAdditionalDatanode(
+          src, stat.getFileId(), block, nodes, storageIDs,
+          exclude.toArray(new DatanodeInfo[exclude.size()]),
+          1, dfsClient.clientName);
+      // a new node was allocated by the namenode. Update nodes.
+      setPipeline(lb);
+
+      //find the new datanode
+      final int d = findNewDatanode(original);
+      //transfer replica. pick a source from the original nodes
+      final DatanodeInfo src = original[tried % original.length];
+      final DatanodeInfo[] targets = {nodes[d]};
+      final StorageType[] targetStorageTypes = {storageTypes[d]};
+
+      try {
+        transfer(src, targets, targetStorageTypes, lb.getBlockToken());
+      } catch (IOException ioe) {
+        DFSClient.LOG.warn("Error transferring data from " + src + " to " +
+            nodes[d] + ": " + ioe.getMessage());
+        caughtException = ioe;
+        // add the allocated node to the exclude list.
+        exclude.add(nodes[d]);
+        setPipeline(original, originalTypes, originalIDs);
+        tried++;
+        continue;
+      }
+      return; // finished successfully
+    }
+    // All retries failed
+    throw (caughtException != null) ? caughtException :
+        new IOException("Failed to add a node");
   }
 
   private void transfer(final DatanodeInfo src, final DatanodeInfo[] targets,
@@ -1236,7 +1260,11 @@ class DataStreamer extends Daemon {
     try {
       sock = createSocketForPipeline(src, 2, dfsClient);
       final long writeTimeout = dfsClient.getDatanodeWriteTimeout(2);
-      final long readTimeout = dfsClient.getDatanodeReadTimeout(2);
+
+      // transfer timeout multiplier based on the transfer size
+      // One per 200 packets = 12.8MB. Minimum is 2.
+      int multi = 2 + (int)(bytesSent/dfsClient.getConf().getWritePacketSize())/200;
+      final long readTimeout = dfsClient.getDatanodeReadTimeout(multi);
 
       OutputStream unbufOut = NetUtils.getOutputStream(sock, writeTimeout);
       InputStream unbufIn = NetUtils.getInputStream(sock, readTimeout);

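As a rough, non-authoritative illustration of the read-timeout scaling introduced above: the multiplier grows by one for every 200 packets already sent (about 12.8 MB at the usual 64 KB write packet size), with a floor of 2, so transfers of nearly full blocks to the replacement datanode get proportionally more time. The standalone sketch below hardcodes an assumed 64 KB packet size; the actual patch reads the value from the client configuration via dfsClient.getConf().getWritePacketSize().

  // Standalone sketch of the timeout-multiplier arithmetic (not HDFS code).
  // Assumes a 64 KB write packet size, which is only an illustrative default;
  // the patch obtains the real value from the client configuration.
  public class TransferTimeoutSketch {
    public static void main(String[] args) {
      final long writePacketSize = 64 * 1024; // assumed packet size, in bytes
      final long[] bytesSentSamples = {0L, 16L << 20, 64L << 20, 256L << 20};
      for (long bytesSent : bytesSentSamples) {
        // One extra timeout unit per 200 packets already sent, minimum 2,
        // mirroring: 2 + (int)(bytesSent/writePacketSize)/200
        int multi = 2 + (int) (bytesSent / writePacketSize) / 200;
        System.out.println(bytesSent + " bytes sent -> timeout multiplier " + multi);
      }
    }
  }

For example, with a 128 MB block that is almost fully written, this works out to roughly 2 + 2048/200 = 12 timeout units, so the block transfer during pipeline recovery is allowed several times the default read timeout rather than timing out and permanently failing the write.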
http://git-wip-us.apache.org/repos/asf/hadoop/blob/4c9497cb/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index 3571e4a..1d9fa1d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -1488,6 +1488,9 @@ Release 2.7.2 - UNRELEASED
     HDFS-9043. Doc updation for commands in HDFS Federation
     (J.Andreina via vinayakumab)
 
+    HDFS-9106. Transfer failure during pipeline recovery causes permanent
+    write failures (kihwal)
+
 Release 2.7.1 - 2015-07-06
 
   INCOMPATIBLE CHANGES