You are viewing a plain text version of this content. The canonical link for it is here.
Posted to hdfs-commits@hadoop.apache.org by ji...@apache.org on 2014/08/06 21:03:08 UTC
svn commit: r1616306 - in
/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs: ./
src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/
src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/
src/test/java/org/apache/hadoop/hdfs/server/nam...
Author: jing9
Date: Wed Aug 6 19:03:07 2014
New Revision: 1616306
URL: http://svn.apache.org/r1616306
Log:
HDFS-6791. A block could remain under replicated if all of its replicas are on decommissioned nodes. Contributed by Ming Ma.
Modified:
hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java
hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDecommissioningStatus.java
Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt?rev=1616306&r1=1616305&r2=1616306&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt Wed Aug 6 19:03:07 2014
@@ -449,6 +449,9 @@ Release 2.6.0 - UNRELEASED
HDFS-6790. DFSUtil Should Use configuration.getPassword for SSL passwords
(Larry McCay via brandonli)
+ HDFS-6791. A block could remain under replicated if all of its replicas are on
+ decommissioned nodes. (Ming Ma via jing9)
+
Release 2.5.0 - UNRELEASED
INCOMPATIBLE CHANGES
Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java?rev=1616306&r1=1616305&r2=1616306&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java Wed Aug 6 19:03:07 2014
@@ -3174,6 +3174,15 @@ public class BlockManager {
}
}
}
+
+ if (!status && !srcNode.isAlive) {
+ LOG.warn("srcNode " + srcNode + " is dead " +
+ "when decommission is in progress. Continue to mark " +
+ "it as decommission in progress. In that way, when it rejoins the " +
+ "cluster it can continue the decommission process.");
+ status = true;
+ }
+
srcNode.decommissioningStatus.set(underReplicatedBlocks,
decommissionOnlyReplicas,
underReplicatedInOpenFiles);
Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java?rev=1616306&r1=1616305&r2=1616306&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java Wed Aug 6 19:03:07 2014
@@ -268,4 +268,14 @@ public class BlockManagerTestUtil {
}
return reports.toArray(StorageReport.EMPTY_ARRAY);
}
+
+ /**
+ * Have DatanodeManager check decommission state.
+ * @param dm the DatanodeManager to manipulate
+ */
+ public static void checkDecommissionState(DatanodeManager dm,
+ DatanodeDescriptor node) {
+ dm.checkDecommissionState(node);
+ }
+
}
Modified: hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDecommissioningStatus.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDecommissioningStatus.java?rev=1616306&r1=1616305&r2=1616306&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDecommissioningStatus.java (original)
+++ hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDecommissioningStatus.java Wed Aug 6 19:03:07 2014
@@ -31,19 +31,24 @@ import java.util.Random;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSCluster.DataNodeProperties;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
import org.apache.hadoop.hdfs.tools.DFSAdmin;
@@ -89,6 +94,8 @@ public class TestDecommissioningStatus {
4);
conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY, 1000);
conf.setInt(DFSConfigKeys.DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY, 1);
+ conf.setLong(DFSConfigKeys.DFS_DATANODE_BALANCE_BANDWIDTHPERSEC_KEY, 1);
+
writeConfigFile(localFileSys, excludeFile, null);
writeConfigFile(localFileSys, includeFile, null);
@@ -99,6 +106,7 @@ public class TestDecommissioningStatus {
@AfterClass
public static void tearDown() throws Exception {
+ if (localFileSys != null ) cleanupFile(localFileSys, dir);
if(fileSys != null) fileSys.close();
if(cluster != null) cluster.shutdown();
}
@@ -138,7 +146,8 @@ public class TestDecommissioningStatus {
return stm;
}
- private void cleanupFile(FileSystem fileSys, Path name) throws IOException {
+ static private void cleanupFile(FileSystem fileSys, Path name)
+ throws IOException {
assertTrue(fileSys.exists(name));
fileSys.delete(name, true);
assertTrue(!fileSys.exists(name));
@@ -147,19 +156,26 @@ public class TestDecommissioningStatus {
/*
* Decommissions the node at the given index
*/
- private String decommissionNode(FSNamesystem namesystem,
- DFSClient client, FileSystem localFileSys, int nodeIndex)
- throws IOException {
+ private String decommissionNode(FSNamesystem namesystem, DFSClient client,
+ FileSystem localFileSys, int nodeIndex) throws IOException {
DatanodeInfo[] info = client.datanodeReport(DatanodeReportType.LIVE);
String nodename = info[nodeIndex].getXferAddr();
- System.out.println("Decommissioning node: " + nodename);
+ decommissionNode(namesystem, localFileSys, nodename);
+ return nodename;
+ }
+
+ /*
+ * Decommissions the node by name
+ */
+ private void decommissionNode(FSNamesystem namesystem,
+ FileSystem localFileSys, String dnName) throws IOException {
+ System.out.println("Decommissioning node: " + dnName);
// write nodename into the exclude file.
ArrayList<String> nodes = new ArrayList<String>(decommissionedNodes);
- nodes.add(nodename);
+ nodes.add(dnName);
writeConfigFile(localFileSys, excludeFile, nodes);
- return nodename;
}
private void checkDecommissionStatus(DatanodeDescriptor decommNode,
@@ -276,6 +292,69 @@ public class TestDecommissioningStatus {
st1.close();
cleanupFile(fileSys, file1);
cleanupFile(fileSys, file2);
- cleanupFile(localFileSys, dir);
+ }
+
+ /**
+ * Verify a DN remains in DECOMMISSION_INPROGRESS state if it is marked
+ * as dead before decommission has completed. That will allow DN to resume
+ * the replication process after it rejoins the cluster.
+ */
+ @Test(timeout=120000)
+ public void testDecommissionStatusAfterDNRestart()
+ throws IOException, InterruptedException {
+ DistributedFileSystem fileSys =
+ (DistributedFileSystem)cluster.getFileSystem();
+
+ // Create a file with one block. That block has one replica.
+ Path f = new Path("decommission.dat");
+ DFSTestUtil.createFile(fileSys, f, fileSize, fileSize, fileSize,
+ (short)1, seed);
+
+ // Find the DN that owns the only replica.
+ RemoteIterator<LocatedFileStatus> fileList = fileSys.listLocatedStatus(f);
+ BlockLocation[] blockLocations = fileList.next().getBlockLocations();
+ String dnName = blockLocations[0].getNames()[0];
+
+ // Decommission the DN.
+ FSNamesystem fsn = cluster.getNamesystem();
+ final DatanodeManager dm = fsn.getBlockManager().getDatanodeManager();
+ decommissionNode(fsn, localFileSys, dnName);
+ dm.refreshNodes(conf);
+
+ // Stop the DN when decommission is in progress.
+ // Given DFS_DATANODE_BALANCE_BANDWIDTHPERSEC_KEY is to 1 and the size of
+ // the block, it will take much longer time that test timeout value for
+ // the decommission to complete. So when stopDataNode is called,
+ // decommission should be in progress.
+ DataNodeProperties dataNodeProperties = cluster.stopDataNode(dnName);
+ final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
+ while (true) {
+ dm.fetchDatanodes(null, dead, false);
+ if (dead.size() == 1) {
+ break;
+ }
+ Thread.sleep(1000);
+ }
+
+ // Force removal of the dead node's blocks.
+ BlockManagerTestUtil.checkHeartbeat(fsn.getBlockManager());
+
+ // Force DatanodeManager to check decommission state.
+ BlockManagerTestUtil.checkDecommissionState(dm, dead.get(0));
+
+ // Verify that the DN remains in DECOMMISSION_INPROGRESS state.
+ assertTrue("the node is in decommissioned state ",
+ !dead.get(0).isDecommissioned());
+
+ // Add the node back
+ cluster.restartDataNode(dataNodeProperties, true);
+ cluster.waitActive();
+
+ // Call refreshNodes on FSNamesystem with empty exclude file.
+ // This will remove the datanodes from decommissioning list and
+ // make them available again.
+ writeConfigFile(localFileSys, excludeFile, null);
+ dm.refreshNodes(conf);
+ cleanupFile(fileSys, f);
}
}