Posted to common-commits@hadoop.apache.org by sj...@apache.org on 2016/10/18 23:45:22 UTC
[36/50] [abbrv] hadoop git commit: HDFS-9390. Block management for
maintenance states.
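
For orientation before the diffs: this patch makes block management aware of
the maintenance admin states (ENTERING_MAINTENANCE and IN_MAINTENANCE) and
reworks the affected tests. The central invariant the new tests assert is
that while a replica's datanode is in maintenance, readers are served
max(replicationFactor - 1, minMaintenanceR) live replicas, where
minMaintenanceR comes from dfs.namenode.maintenance.replication.min. The
sketch below is illustrative only, not code from the patch:

// Illustrative sketch; mirrors the invariant asserted by
// testExpectedReplication in TestMaintenanceState below.
public class MaintenanceReplicationSketch {
  // minMaintenanceR corresponds to
  // DFSConfigKeys.DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY.
  static int expectedReplicasInRead(int replicationFactor,
      int minMaintenanceR) {
    // The replica on the maintenance node stays in the block map but is
    // excluded from LocatedBlock; the NameNode re-replicates as needed to
    // keep at least this many replicas live.
    return Math.max(replicationFactor - 1, minMaintenanceR);
  }

  public static void main(String[] args) {
    // With the default minimum of 1, a 3-replica file keeps 2 live replicas.
    System.out.println(expectedReplicasInRead(3, 1)); // prints 2
    // With a minimum of 0, a 1-replica file needs no live replica, so its
    // node can transition straight to IN_MAINTENANCE.
    System.out.println(expectedReplicasInRead(1, 0)); // prints 0
  }
}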
http://git-wip-us.apache.org/repos/asf/hadoop/blob/b61fb267/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestMaintenanceState.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestMaintenanceState.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestMaintenanceState.java
index 63617ad..c125f45 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestMaintenanceState.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestMaintenanceState.java
@@ -18,13 +18,19 @@
package org.apache.hadoop.hdfs;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.client.HdfsDataInputStream;
@@ -32,6 +38,8 @@ import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo.AdminStates;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
+import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.util.Time;
import org.junit.Test;
@@ -40,13 +48,23 @@ import org.junit.Test;
* This class tests node maintenance.
*/
public class TestMaintenanceState extends AdminStatesBaseTest {
- public static final Log LOG = LogFactory.getLog(TestMaintenanceState.class);
- static private final long EXPIRATION_IN_MS = 500;
+ public static final Logger LOG =
+ LoggerFactory.getLogger(TestMaintenanceState.class);
+ static private final long EXPIRATION_IN_MS = 50;
+ private int minMaintenanceR =
+ DFSConfigKeys.DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_DEFAULT;
public TestMaintenanceState() {
setUseCombinedHostFileManager();
}
+ void setMinMaintenanceR(int minMaintenanceR) {
+ this.minMaintenanceR = minMaintenanceR;
+ getConf().setInt(
+ DFSConfigKeys.DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY,
+ minMaintenanceR);
+ }
+
/**
* Verify a node can transition from AdminStates.ENTERING_MAINTENANCE to
* AdminStates.NORMAL.
@@ -55,21 +73,25 @@ public class TestMaintenanceState extends AdminStatesBaseTest {
public void testTakeNodeOutOfEnteringMaintenance() throws Exception {
LOG.info("Starting testTakeNodeOutOfEnteringMaintenance");
final int replicas = 1;
- final int numNamenodes = 1;
- final int numDatanodes = 1;
- final Path file1 = new Path("/testTakeNodeOutOfEnteringMaintenance.dat");
+ final Path file = new Path("/testTakeNodeOutOfEnteringMaintenance.dat");
- startCluster(numNamenodes, numDatanodes);
+ startCluster(1, 1);
- FileSystem fileSys = getCluster().getFileSystem(0);
- writeFile(fileSys, file1, replicas, 1);
+ final FileSystem fileSys = getCluster().getFileSystem(0);
+ final FSNamesystem ns = getCluster().getNamesystem(0);
+ writeFile(fileSys, file, replicas, 1);
- DatanodeInfo nodeOutofService = takeNodeOutofService(0,
+ final DatanodeInfo nodeOutofService = takeNodeOutofService(0,
null, Long.MAX_VALUE, null, AdminStates.ENTERING_MAINTENANCE);
+ // When a node is in ENTERING_MAINTENANCE state, it can still serve
+ // read requests.
+ assertNull(checkWithRetry(ns, fileSys, file, replicas, null,
+ nodeOutofService));
+
putNodeInService(0, nodeOutofService.getDatanodeUuid());
- cleanupFile(fileSys, file1);
+ cleanupFile(fileSys, file);
}
/**
@@ -80,23 +102,21 @@ public class TestMaintenanceState extends AdminStatesBaseTest {
public void testEnteringMaintenanceExpiration() throws Exception {
LOG.info("Starting testEnteringMaintenanceExpiration");
final int replicas = 1;
- final int numNamenodes = 1;
- final int numDatanodes = 1;
- final Path file1 = new Path("/testTakeNodeOutOfEnteringMaintenance.dat");
+ final Path file = new Path("/testEnteringMaintenanceExpiration.dat");
- startCluster(numNamenodes, numDatanodes);
+ startCluster(1, 1);
- FileSystem fileSys = getCluster().getFileSystem(0);
- writeFile(fileSys, file1, replicas, 1);
+ final FileSystem fileSys = getCluster().getFileSystem(0);
+ writeFile(fileSys, file, replicas, 1);
- // expires in 500 milliseconds
- DatanodeInfo nodeOutofService = takeNodeOutofService(0, null,
- Time.monotonicNow() + EXPIRATION_IN_MS, null,
- AdminStates.ENTERING_MAINTENANCE);
+ final DatanodeInfo nodeOutofService = takeNodeOutofService(0, null,
+ Long.MAX_VALUE, null, AdminStates.ENTERING_MAINTENANCE);
- waitNodeState(nodeOutofService, AdminStates.NORMAL);
+ // Adjust the expiration.
+ takeNodeOutofService(0, nodeOutofService.getDatanodeUuid(),
+ Time.monotonicNow() + EXPIRATION_IN_MS, null, AdminStates.NORMAL);
- cleanupFile(fileSys, file1);
+ cleanupFile(fileSys, file);
}
/**
@@ -106,20 +126,18 @@ public class TestMaintenanceState extends AdminStatesBaseTest {
public void testInvalidExpiration() throws Exception {
LOG.info("Starting testInvalidExpiration");
final int replicas = 1;
- final int numNamenodes = 1;
- final int numDatanodes = 1;
- final Path file1 = new Path("/testTakeNodeOutOfEnteringMaintenance.dat");
+ final Path file = new Path("/testInvalidExpiration.dat");
- startCluster(numNamenodes, numDatanodes);
+ startCluster(1, 1);
- FileSystem fileSys = getCluster().getFileSystem(0);
- writeFile(fileSys, file1, replicas, 1);
+ final FileSystem fileSys = getCluster().getFileSystem(0);
+ writeFile(fileSys, file, replicas, 1);
// expiration has to be greater than Time.monotonicNow().
takeNodeOutofService(0, null, Time.monotonicNow(), null,
AdminStates.NORMAL);
- cleanupFile(fileSys, file1);
+ cleanupFile(fileSys, file);
}
/**
@@ -129,18 +147,17 @@ public class TestMaintenanceState extends AdminStatesBaseTest {
@Test(timeout = 360000)
public void testPutDeadNodeToMaintenance() throws Exception {
LOG.info("Starting testPutDeadNodeToMaintenance");
- final int numNamenodes = 1;
- final int numDatanodes = 1;
final int replicas = 1;
- final Path file1 = new Path("/testPutDeadNodeToMaintenance.dat");
+ final Path file = new Path("/testPutDeadNodeToMaintenance.dat");
- startCluster(numNamenodes, numDatanodes);
+ startCluster(1, 1);
- FileSystem fileSys = getCluster().getFileSystem(0);
- FSNamesystem ns = getCluster().getNamesystem(0);
- writeFile(fileSys, file1, replicas, 1);
+ final FileSystem fileSys = getCluster().getFileSystem(0);
+ final FSNamesystem ns = getCluster().getNamesystem(0);
+ writeFile(fileSys, file, replicas, 1);
- MiniDFSCluster.DataNodeProperties dnProp = getCluster().stopDataNode(0);
+ final MiniDFSCluster.DataNodeProperties dnProp =
+ getCluster().stopDataNode(0);
DFSTestUtil.waitForDatanodeState(
getCluster(), dnProp.datanode.getDatanodeUuid(), false, 20000);
@@ -153,7 +170,7 @@ public class TestMaintenanceState extends AdminStatesBaseTest {
assertEquals(deadInMaintenance + 1, ns.getNumInMaintenanceDeadDataNodes());
assertEquals(liveInMaintenance, ns.getNumInMaintenanceLiveDataNodes());
- cleanupFile(fileSys, file1);
+ cleanupFile(fileSys, file);
}
/**
@@ -164,16 +181,14 @@ public class TestMaintenanceState extends AdminStatesBaseTest {
@Test(timeout = 360000)
public void testPutDeadNodeToMaintenanceWithExpiration() throws Exception {
LOG.info("Starting testPutDeadNodeToMaintenanceWithExpiration");
- final int numNamenodes = 1;
- final int numDatanodes = 1;
- final int replicas = 1;
- final Path file1 = new Path("/testPutDeadNodeToMaintenance.dat");
+ final Path file =
+ new Path("/testPutDeadNodeToMaintenanceWithExpiration.dat");
- startCluster(numNamenodes, numDatanodes);
+ startCluster(1, 1);
FileSystem fileSys = getCluster().getFileSystem(0);
FSNamesystem ns = getCluster().getNamesystem(0);
- writeFile(fileSys, file1, replicas, 1);
+ writeFile(fileSys, file, 1, 1);
MiniDFSCluster.DataNodeProperties dnProp = getCluster().stopDataNode(0);
DFSTestUtil.waitForDatanodeState(
@@ -184,16 +199,17 @@ public class TestMaintenanceState extends AdminStatesBaseTest {
DatanodeInfo nodeOutofService = takeNodeOutofService(0,
dnProp.datanode.getDatanodeUuid(),
- Time.monotonicNow() + EXPIRATION_IN_MS, null,
- AdminStates.IN_MAINTENANCE);
+ Long.MAX_VALUE, null, AdminStates.IN_MAINTENANCE);
- waitNodeState(nodeOutofService, AdminStates.NORMAL);
+ // Adjust the expiration.
+ takeNodeOutofService(0, nodeOutofService.getDatanodeUuid(),
+ Time.monotonicNow() + EXPIRATION_IN_MS, null, AdminStates.NORMAL);
// no change
assertEquals(deadInMaintenance, ns.getNumInMaintenanceDeadDataNodes());
assertEquals(liveInMaintenance, ns.getNumInMaintenanceLiveDataNodes());
- cleanupFile(fileSys, file1);
+ cleanupFile(fileSys, file);
}
/**
@@ -202,15 +218,12 @@ public class TestMaintenanceState extends AdminStatesBaseTest {
@Test(timeout = 360000)
public void testTransitionFromDecommissioned() throws IOException {
LOG.info("Starting testTransitionFromDecommissioned");
- final int numNamenodes = 1;
- final int numDatanodes = 4;
- final int replicas = 3;
- final Path file1 = new Path("/testTransitionFromDecommissioned.dat");
+ final Path file = new Path("/testTransitionFromDecommissioned.dat");
- startCluster(numNamenodes, numDatanodes);
+ startCluster(1, 4);
- FileSystem fileSys = getCluster().getFileSystem(0);
- writeFile(fileSys, file1, replicas, 1);
+ final FileSystem fileSys = getCluster().getFileSystem(0);
+ writeFile(fileSys, file, 3, 1);
DatanodeInfo nodeOutofService = takeNodeOutofService(0, null, 0, null,
AdminStates.DECOMMISSIONED);
@@ -218,7 +231,7 @@ public class TestMaintenanceState extends AdminStatesBaseTest {
takeNodeOutofService(0, nodeOutofService.getDatanodeUuid(), Long.MAX_VALUE,
null, AdminStates.IN_MAINTENANCE);
- cleanupFile(fileSys, file1);
+ cleanupFile(fileSys, file);
}
/**
@@ -228,34 +241,33 @@ public class TestMaintenanceState extends AdminStatesBaseTest {
@Test(timeout = 360000)
public void testTransitionFromDecommissionedAndExpired() throws IOException {
LOG.info("Starting testTransitionFromDecommissionedAndExpired");
- final int numNamenodes = 1;
- final int numDatanodes = 4;
- final int replicas = 3;
- final Path file1 = new Path("/testTransitionFromDecommissioned.dat");
+ final Path file =
+ new Path("/testTransitionFromDecommissionedAndExpired.dat");
- startCluster(numNamenodes, numDatanodes);
+ startCluster(1, 4);
- FileSystem fileSys = getCluster().getFileSystem(0);
- writeFile(fileSys, file1, replicas, 1);
+ final FileSystem fileSys = getCluster().getFileSystem(0);
+ writeFile(fileSys, file, 3, 1);
- DatanodeInfo nodeOutofService = takeNodeOutofService(0, null, 0, null,
- AdminStates.DECOMMISSIONED);
+ final DatanodeInfo nodeOutofService = takeNodeOutofService(0, null, 0,
+ null, AdminStates.DECOMMISSIONED);
takeNodeOutofService(0, nodeOutofService.getDatanodeUuid(),
- Time.monotonicNow() + EXPIRATION_IN_MS, null,
- AdminStates.IN_MAINTENANCE);
+ Long.MAX_VALUE, null, AdminStates.IN_MAINTENANCE);
- waitNodeState(nodeOutofService, AdminStates.NORMAL);
+ // Adjust the expiration.
+ takeNodeOutofService(0, nodeOutofService.getDatanodeUuid(),
+ Time.monotonicNow() + EXPIRATION_IN_MS, null, AdminStates.NORMAL);
- cleanupFile(fileSys, file1);
+ cleanupFile(fileSys, file);
}
/**
* When a node is put to maintenance, it first transitions to
* AdminStates.ENTERING_MAINTENANCE. It makes sure all blocks have minimal
* replication before it can be transitioned to AdminStates.IN_MAINTENANCE.
- * If node becomes dead when it is in AdminStates.ENTERING_MAINTENANCE, admin
- * state should stay in AdminStates.ENTERING_MAINTENANCE state.
+ * If node becomes dead when it is in AdminStates.ENTERING_MAINTENANCE, it
+ * should stay in AdminStates.ENTERING_MAINTENANCE state.
*/
@Test(timeout = 360000)
public void testNodeDeadWhenInEnteringMaintenance() throws Exception {
@@ -263,16 +275,16 @@ public class TestMaintenanceState extends AdminStatesBaseTest {
final int numNamenodes = 1;
final int numDatanodes = 1;
final int replicas = 1;
- final Path file1 = new Path("/testNodeDeadWhenInEnteringMaintenance.dat");
+ final Path file = new Path("/testNodeDeadWhenInEnteringMaintenance.dat");
startCluster(numNamenodes, numDatanodes);
- FileSystem fileSys = getCluster().getFileSystem(0);
- FSNamesystem ns = getCluster().getNamesystem(0);
- writeFile(fileSys, file1, replicas, 1);
+ final FileSystem fileSys = getCluster().getFileSystem(0);
+ final FSNamesystem ns = getCluster().getNamesystem(0);
+ writeFile(fileSys, file, replicas, 1);
DatanodeInfo nodeOutofService = takeNodeOutofService(0,
- getFirstBlockFirstReplicaUuid(fileSys, file1), Long.MAX_VALUE, null,
+ getFirstBlockFirstReplicaUuid(fileSys, file), Long.MAX_VALUE, null,
AdminStates.ENTERING_MAINTENANCE);
assertEquals(1, ns.getNumEnteringMaintenanceDataNodes());
@@ -281,30 +293,627 @@ public class TestMaintenanceState extends AdminStatesBaseTest {
DFSTestUtil.waitForDatanodeState(
getCluster(), nodeOutofService.getDatanodeUuid(), false, 20000);
DFSClient client = getDfsClient(0);
- assertEquals("maintenance node shouldn't be alive", numDatanodes - 1,
+ assertEquals("maintenance node shouldn't be live", numDatanodes - 1,
client.datanodeReport(DatanodeReportType.LIVE).length);
+ assertEquals(1, ns.getNumEnteringMaintenanceDataNodes());
getCluster().restartDataNode(dnProp, true);
getCluster().waitActive();
waitNodeState(nodeOutofService, AdminStates.ENTERING_MAINTENANCE);
assertEquals(1, ns.getNumEnteringMaintenanceDataNodes());
+ assertEquals("maintenance node should be live", numDatanodes,
+ client.datanodeReport(DatanodeReportType.LIVE).length);
+
+ cleanupFile(fileSys, file);
+ }
+
+ /**
+ * When a node is put to maintenance, it first transitions to
+ * AdminStates.ENTERING_MAINTENANCE. It makes sure all blocks have
+ * been properly replicated before it can be transitioned to
+ * AdminStates.IN_MAINTENANCE. The expected replication count takes
+ * DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY and
+ * the file's replication factor into account.
+ */
+ @Test(timeout = 360000)
+ public void testExpectedReplications() throws IOException {
+ LOG.info("Starting testExpectedReplications");
+ testExpectedReplication(1);
+ testExpectedReplication(2);
+ testExpectedReplication(3);
+ testExpectedReplication(4);
+ }
+
+ private void testExpectedReplication(int replicationFactor)
+ throws IOException {
+ testExpectedReplication(replicationFactor,
+ Math.max(replicationFactor - 1, this.minMaintenanceR));
+ }
+
+ private void testExpectedReplication(int replicationFactor,
+ int expectedReplicasInRead) throws IOException {
+ startCluster(1, 5);
+
+ final Path file = new Path("/testExpectedReplication.dat");
+
+ final FileSystem fileSys = getCluster().getFileSystem(0);
+ final FSNamesystem ns = getCluster().getNamesystem(0);
+
+ writeFile(fileSys, file, replicationFactor, 1);
+
+ DatanodeInfo nodeOutofService = takeNodeOutofService(0,
+ getFirstBlockFirstReplicaUuid(fileSys, file), Long.MAX_VALUE,
+ null, AdminStates.IN_MAINTENANCE);
+
+ // The block should be replicated to another datanode to meet
+ // the expected replication count.
+ assertNull(checkWithRetry(ns, fileSys, file, expectedReplicasInRead,
+ nodeOutofService));
+
+ cleanupFile(fileSys, file);
+ }
+
+ /**
+ * Verify a node can transition directly to AdminStates.IN_MAINTENANCE when
+ * DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY is set to zero.
+ */
+ @Test(timeout = 360000)
+ public void testZeroMinMaintenanceReplication() throws Exception {
+ LOG.info("Starting testZeroMinMaintenanceReplication");
+ setMinMaintenanceR(0);
+ startCluster(1, 1);
+
+ final Path file = new Path("/testZeroMinMaintenanceReplication.dat");
+ final int replicas = 1;
+
+ FileSystem fileSys = getCluster().getFileSystem(0);
+ writeFile(fileSys, file, replicas, 1);
+
+ takeNodeOutofService(0, null, Long.MAX_VALUE, null,
+ AdminStates.IN_MAINTENANCE);
+
+ cleanupFile(fileSys, file);
+ }
+
+ /**
+ * Verify a node can transition directly to AdminStates.IN_MAINTENANCE when
+ * DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY is set to zero. Then later
+ * transition to NORMAL after maintenance expiration.
+ */
+ @Test(timeout = 360000)
+ public void testZeroMinMaintenanceReplicationWithExpiration()
+ throws Exception {
+ LOG.info("Starting testZeroMinMaintenanceReplicationWithExpiration");
+ setMinMaintenanceR(0);
+ startCluster(1, 1);
+
+ final Path file =
+ new Path("/testZeroMinMaintenanceReplicationWithExpiration.dat");
+
+ FileSystem fileSys = getCluster().getFileSystem(0);
+ writeFile(fileSys, file, 1, 1);
+
+ DatanodeInfo nodeOutofService = takeNodeOutofService(0, null,
+ Long.MAX_VALUE, null, AdminStates.IN_MAINTENANCE);
+
+ // Adjust the expiration.
+ takeNodeOutofService(0, nodeOutofService.getDatanodeUuid(),
+ Time.monotonicNow() + EXPIRATION_IN_MS, null, AdminStates.NORMAL);
+
+ cleanupFile(fileSys, file);
+ }
+
+ /**
+ * Transition from IN_MAINTENANCE to DECOMMISSIONED.
+ */
+ @Test(timeout = 360000)
+ public void testTransitionToDecommission() throws IOException {
+ LOG.info("Starting testTransitionToDecommission");
+ final int numNamenodes = 1;
+ final int numDatanodes = 4;
+ startCluster(numNamenodes, numDatanodes);
+
+ final Path file = new Path("testTransitionToDecommission.dat");
+ final int replicas = 3;
+
+ FileSystem fileSys = getCluster().getFileSystem(0);
+ FSNamesystem ns = getCluster().getNamesystem(0);
+
+ writeFile(fileSys, file, replicas, 1);
+
+ DatanodeInfo nodeOutofService = takeNodeOutofService(0,
+ getFirstBlockFirstReplicaUuid(fileSys, file), Long.MAX_VALUE, null,
+ AdminStates.IN_MAINTENANCE);
+
+ DFSClient client = getDfsClient(0);
+ assertEquals("All datanodes must be alive", numDatanodes,
+ client.datanodeReport(DatanodeReportType.LIVE).length);
+
+ // test 1, verify the replica in IN_MAINTENANCE state isn't in LocatedBlock
+ assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
+ nodeOutofService));
+
+ takeNodeOutofService(0, nodeOutofService.getDatanodeUuid(), 0, null,
+ AdminStates.DECOMMISSIONED);
+
+ // test 2, after decommission has completed, the replication count is
+ // replicas + 1 which includes the decommissioned node.
+ assertNull(checkWithRetry(ns, fileSys, file, replicas + 1, null));
+
+ // test 3, put the node in service, replication count should restore.
+ putNodeInService(0, nodeOutofService.getDatanodeUuid());
+ assertNull(checkWithRetry(ns, fileSys, file, replicas, null));
+
+ cleanupFile(fileSys, file);
+ }
+
+ /**
+ * Transition from decommissioning state to maintenance state.
+ */
+ @Test(timeout = 360000)
+ public void testTransitionFromDecommissioning() throws IOException {
+ LOG.info("Starting testTransitionFromDecommissioning");
+ startCluster(1, 3);
+
+ final Path file = new Path("/testTransitionFromDecommissioning.dat");
+ final int replicas = 3;
+
+ final FileSystem fileSys = getCluster().getFileSystem(0);
+ final FSNamesystem ns = getCluster().getNamesystem(0);
+
+ writeFile(fileSys, file, replicas);
+
+ final DatanodeInfo nodeOutofService = takeNodeOutofService(0, null, 0,
+ null, AdminStates.DECOMMISSION_INPROGRESS);
+
+ takeNodeOutofService(0, nodeOutofService.getDatanodeUuid(), Long.MAX_VALUE,
+ null, AdminStates.IN_MAINTENANCE);
+
+ assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
+ nodeOutofService));
+
+ cleanupFile(fileSys, file);
+ }
+
+
+ /**
+ * First put a node in maintenance, then put a different node
+ * in decommission. Make sure the decommission process takes
+ * maintenance replicas into account.
+ */
+ @Test(timeout = 360000)
+ public void testDecommissionDifferentNodeAfterMaintenances()
+ throws Exception {
+ testDecommissionDifferentNodeAfterMaintenance(2);
+ testDecommissionDifferentNodeAfterMaintenance(3);
+ testDecommissionDifferentNodeAfterMaintenance(4);
+ }
+
+ private void testDecommissionDifferentNodeAfterMaintenance(int repl)
+ throws Exception {
+ startCluster(1, 5);
+
+ final Path file =
+ new Path("/testDecommissionDifferentNodeAfterMaintenance.dat");
+
+ final FileSystem fileSys = getCluster().getFileSystem(0);
+ final FSNamesystem ns = getCluster().getNamesystem(0);
+
+ writeFile(fileSys, file, repl, 1);
+ final DatanodeInfo[] nodes = getFirstBlockReplicasDatanodeInfos(fileSys,
+ file);
+ String maintenanceDNUuid = nodes[0].getDatanodeUuid();
+ String decommissionDNUuid = nodes[1].getDatanodeUuid();
+ DatanodeInfo maintenanceDN = takeNodeOutofService(0, maintenanceDNUuid,
+ Long.MAX_VALUE, null, null, AdminStates.IN_MAINTENANCE);
- cleanupFile(fileSys, file1);
+ Map<DatanodeInfo, Long> maintenanceNodes = new HashMap<>();
+ maintenanceNodes.put(nodes[0], Long.MAX_VALUE);
+ takeNodeOutofService(0, decommissionDNUuid, 0, null, maintenanceNodes,
+ AdminStates.DECOMMISSIONED);
+ // Out of the replicas returned, one is the decommissioned node.
+ assertNull(checkWithRetry(ns, fileSys, file, repl, maintenanceDN));
+
+ putNodeInService(0, maintenanceDN);
+ assertNull(checkWithRetry(ns, fileSys, file, repl + 1, null));
+
+ cleanupFile(fileSys, file);
+ }
+
+
+ @Test(timeout = 360000)
+ public void testChangeReplicationFactors() throws IOException {
+ // Prior to any change, there is 1 maintenance node and 2 live nodes.
+
+ // Replication factor is adjusted from 3 to 4.
+ // After the change, given 1 maintenance + 2 live is less than the
+ // newFactor, one live node will be added.
+ testChangeReplicationFactor(3, 4, 3);
+
+ // Replication factor is adjusted from 3 to 2.
+ // After the change, given the 2 live nodes match the newFactor,
+ // no live nodes will be invalidated.
+ testChangeReplicationFactor(3, 2, 2);
+
+ // Replication factor is adjusted from 3 to 1.
+ // After the change, given the 2 live nodes exceed the newFactor,
+ // one live node will be invalidated.
+ testChangeReplicationFactor(3, 1, 1);
+ }
+
+ /**
+ * After the change of replication factor, the # of live replicas should
+ * be <= the new replication factor.
+ */
+ private void testChangeReplicationFactor(int oldFactor, int newFactor,
+ int expectedLiveReplicas) throws IOException {
+ LOG.info("Starting testChangeReplicationFactor {} {} {}",
+ oldFactor, newFactor, expectedLiveReplicas);
+ startCluster(1, 5);
+
+ final Path file = new Path("/testChangeReplicationFactor.dat");
+
+ final FileSystem fileSys = getCluster().getFileSystem(0);
+ final FSNamesystem ns = getCluster().getNamesystem(0);
+
+ writeFile(fileSys, file, oldFactor, 1);
+
+ final DatanodeInfo nodeOutofService = takeNodeOutofService(0,
+ getFirstBlockFirstReplicaUuid(fileSys, file), Long.MAX_VALUE, null,
+ AdminStates.IN_MAINTENANCE);
+
+ // Verify that the nodeOutofService remains in blocksMap and
+ // the # of live replicas for read operations is as expected.
+ assertNull(checkWithRetry(ns, fileSys, file, oldFactor - 1,
+ nodeOutofService));
+
+ final DFSClient client = getDfsClient(0);
+ client.setReplication(file.toString(), (short)newFactor);
+
+ // Verify that the nodeOutofService remains in blocksMap and
+ // the # of live replicas for read operations is as expected.
+ assertNull(checkWithRetry(ns, fileSys, file, expectedLiveReplicas,
+ nodeOutofService));
+
+ putNodeInService(0, nodeOutofService.getDatanodeUuid());
+ assertNull(checkWithRetry(ns, fileSys, file, newFactor, null));
+
+ cleanupFile(fileSys, file);
+ }
+
+
+ /**
+ * Verify the following scenario.
+ * a. Put a live node to maintenance => 1 maintenance, 2 live.
+ * b. The maintenance node becomes dead => block map still has 1 maintenance,
+ * 2 live.
+ * c. Take the node out of maintenance => NN should schedule the replication
+ * and end up with 3 live.
+ */
+ @Test(timeout = 360000)
+ public void testTakeDeadNodeOutOfMaintenance() throws Exception {
+ LOG.info("Starting testTakeDeadNodeOutOfMaintenance");
+ final int numNamenodes = 1;
+ final int numDatanodes = 4;
+ startCluster(numNamenodes, numDatanodes);
+
+ final Path file = new Path("/testTakeDeadNodeOutOfMaintenance.dat");
+ final int replicas = 3;
+
+ final FileSystem fileSys = getCluster().getFileSystem(0);
+ final FSNamesystem ns = getCluster().getNamesystem(0);
+ writeFile(fileSys, file, replicas, 1);
+
+ final DatanodeInfo nodeOutofService = takeNodeOutofService(0,
+ getFirstBlockFirstReplicaUuid(fileSys, file), Long.MAX_VALUE, null,
+ AdminStates.IN_MAINTENANCE);
+
+ assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
+ nodeOutofService));
+
+ final DFSClient client = getDfsClient(0);
+ assertEquals("All datanodes must be alive", numDatanodes,
+ client.datanodeReport(DatanodeReportType.LIVE).length);
+
+ getCluster().stopDataNode(nodeOutofService.getXferAddr());
+ DFSTestUtil.waitForDatanodeState(
+ getCluster(), nodeOutofService.getDatanodeUuid(), false, 20000);
+ assertEquals("maintenance node shouldn't be alive", numDatanodes - 1,
+ client.datanodeReport(DatanodeReportType.LIVE).length);
+
+ // Dead maintenance node's blocks should remain in block map.
+ assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
+ nodeOutofService));
+
+ // When the dead maintenance node is transitioned out of maintenance,
+ // its blocks should be removed from the block map.
+ // This will then trigger replication to restore the live replicas back
+ // to replication factor.
+ putNodeInService(0, nodeOutofService.getDatanodeUuid());
+ assertNull(checkWithRetry(ns, fileSys, file, replicas, nodeOutofService,
+ null));
+
+ cleanupFile(fileSys, file);
}
- static protected String getFirstBlockFirstReplicaUuid(FileSystem fileSys,
+
+ /**
+ * Verify the following scenario.
+ * a. Put a live node to maintenance => 1 maintenance, 2 live.
+ * b. The maintenance node becomes dead => block map still has 1 maintenance,
+ * 2 live.
+ * c. Restart nn => block map only has 2 live => restore the 3 live.
+ * d. Restart the maintenance dn => 1 maintenance, 3 live.
+ * e. Take the node out of maintenance => over replication => 3 live.
+ */
+ @Test(timeout = 360000)
+ public void testWithNNAndDNRestart() throws Exception {
+ LOG.info("Starting testWithNNAndDNRestart");
+ final int numNamenodes = 1;
+ final int numDatanodes = 4;
+ startCluster(numNamenodes, numDatanodes);
+
+ final Path file = new Path("/testWithNNAndDNRestart.dat");
+ final int replicas = 3;
+
+ final FileSystem fileSys = getCluster().getFileSystem(0);
+ FSNamesystem ns = getCluster().getNamesystem(0);
+ writeFile(fileSys, file, replicas, 1);
+
+ DatanodeInfo nodeOutofService = takeNodeOutofService(0,
+ getFirstBlockFirstReplicaUuid(fileSys, file), Long.MAX_VALUE, null,
+ AdminStates.IN_MAINTENANCE);
+
+ assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
+ nodeOutofService));
+
+ DFSClient client = getDfsClient(0);
+ assertEquals("All datanodes must be alive", numDatanodes,
+ client.datanodeReport(DatanodeReportType.LIVE).length);
+
+ MiniDFSCluster.DataNodeProperties dnProp =
+ getCluster().stopDataNode(nodeOutofService.getXferAddr());
+ DFSTestUtil.waitForDatanodeState(
+ getCluster(), nodeOutofService.getDatanodeUuid(), false, 20000);
+ assertEquals("maintenance node shouldn't be alive", numDatanodes - 1,
+ client.datanodeReport(DatanodeReportType.LIVE).length);
+
+ // Dead maintenance node's blocks should remain in block map.
+ assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
+ nodeOutofService));
+
+ // restart nn, nn will restore 3 live replicas given it doesn't
+ // know the maintenance node has the replica.
+ getCluster().restartNameNode(0);
+ ns = getCluster().getNamesystem(0);
+ assertNull(checkWithRetry(ns, fileSys, file, replicas, null));
+
+ // restart dn, nn has 1 maintenance replica and 3 live replicas.
+ getCluster().restartDataNode(dnProp, true);
+ getCluster().waitActive();
+ assertNull(checkWithRetry(ns, fileSys, file, replicas, nodeOutofService));
+
+ // Put the node in service, a redundant replica should be removed.
+ putNodeInService(0, nodeOutofService.getDatanodeUuid());
+ assertNull(checkWithRetry(ns, fileSys, file, replicas, null));
+
+ cleanupFile(fileSys, file);
+ }
+
+
+ /**
+ * A machine in maintenance state won't be chosen for new block allocation.
+ */
+ @Test(timeout = 3600000)
+ public void testWriteAfterMaintenance() throws IOException {
+ LOG.info("Starting testWriteAfterMaintenance");
+ startCluster(1, 3);
+
+ final Path file = new Path("/testWriteAfterMaintenance.dat");
+ int replicas = 3;
+
+ final FileSystem fileSys = getCluster().getFileSystem(0);
+ FSNamesystem ns = getCluster().getNamesystem(0);
+
+ final DatanodeInfo nodeOutofService = takeNodeOutofService(0, null,
+ Long.MAX_VALUE, null, AdminStates.IN_MAINTENANCE);
+
+ writeFile(fileSys, file, replicas, 2);
+
+ // Verify nodeOutofService wasn't chosen for write operation.
+ assertNull(checkWithRetry(ns, fileSys, file, replicas - 1,
+ nodeOutofService, null));
+
+ // Put the node back to service, live replicas should be restored.
+ putNodeInService(0, nodeOutofService.getDatanodeUuid());
+ assertNull(checkWithRetry(ns, fileSys, file, replicas, null));
+
+ cleanupFile(fileSys, file);
+ }
+
+ /**
+ * A node has blocks under construction when it is put to maintenance.
+ * Given there are minReplication replicas somewhere else,
+ * it can be transitioned to AdminStates.IN_MAINTENANCE.
+ */
+ @Test(timeout = 360000)
+ public void testEnterMaintenanceWhenFileOpen() throws Exception {
+ LOG.info("Starting testEnterMaintenanceWhenFileOpen");
+ startCluster(1, 3);
+
+ final Path file = new Path("/testEnterMaintenanceWhenFileOpen.dat");
+
+ final FileSystem fileSys = getCluster().getFileSystem(0);
+ writeIncompleteFile(fileSys, file, (short)3, (short)2);
+
+ takeNodeOutofService(0, null, Long.MAX_VALUE, null,
+ AdminStates.IN_MAINTENANCE);
+
+ cleanupFile(fileSys, file);
+ }
+
+ /**
+ * A machine in maintenance state won't be chosen for invalidation.
+ */
+ @Test(timeout = 360000)
+ public void testInvalidation() throws IOException {
+ LOG.info("Starting testInvalidation");
+ int numNamenodes = 1;
+ int numDatanodes = 3;
+ startCluster(numNamenodes, numDatanodes);
+
+ Path file = new Path("/testInvalidation.dat");
+ int replicas = 3;
+
+ FileSystem fileSys = getCluster().getFileSystem(0);
+ FSNamesystem ns = getCluster().getNamesystem(0);
+
+ writeFile(fileSys, file, replicas);
+
+ DatanodeInfo nodeOutofService = takeNodeOutofService(0, null,
+ Long.MAX_VALUE, null, AdminStates.IN_MAINTENANCE);
+
+ DFSClient client = getDfsClient(0);
+ client.setReplication(file.toString(), (short) 1);
+
+ // Verify the nodeOutofService remains in blocksMap.
+ assertNull(checkWithRetry(ns, fileSys, file, 1, nodeOutofService));
+
+ // Restart NN and verify the nodeOutofService remains in blocksMap.
+ getCluster().restartNameNode(0);
+ ns = getCluster().getNamesystem(0);
+ assertNull(checkWithRetry(ns, fileSys, file, 1, nodeOutofService));
+
+ cleanupFile(fileSys, file);
+ }
+
+ static String getFirstBlockFirstReplicaUuid(FileSystem fileSys,
Path name) throws IOException {
+ DatanodeInfo[] nodes = getFirstBlockReplicasDatanodeInfos(fileSys, name);
+ if (nodes != null && nodes.length != 0) {
+ return nodes[0].getDatanodeUuid();
+ } else {
+ return null;
+ }
+ }
+
+ /*
+ * Verify that the number of replicas is as expected for each block in
+ * the given file.
+ *
+ * @return - null if no failure found, else an error message string.
+ */
+ static String checkFile(FSNamesystem ns, FileSystem fileSys,
+ Path name, int repl, DatanodeInfo expectedExcludedNode,
+ DatanodeInfo expectedMaintenanceNode) throws IOException {
// need a raw stream
assertTrue("Not HDFS:"+fileSys.getUri(),
fileSys instanceof DistributedFileSystem);
HdfsDataInputStream dis = (HdfsDataInputStream)fileSys.open(name);
+ BlockManager bm = ns.getBlockManager();
Collection<LocatedBlock> dinfo = dis.getAllBlocks();
+ String output;
for (LocatedBlock blk : dinfo) { // for each block
DatanodeInfo[] nodes = blk.getLocations();
- if (nodes.length > 0) {
- return nodes[0].getDatanodeUuid();
+ for (int j = 0; j < nodes.length; j++) { // for each replica
+ if (expectedExcludedNode != null &&
+ nodes[j].equals(expectedExcludedNode)) {
+ //excluded node must not be in LocatedBlock.
+ output = "For block " + blk.getBlock() + " replica on " +
+ nodes[j] + " found in LocatedBlock.";
+ LOG.info(output);
+ return output;
+ } else {
+ if (nodes[j].isInMaintenance()) {
+ //IN_MAINTENANCE node must not be in LocatedBlock.
+ output = "For block " + blk.getBlock() + " replica on " +
+ nodes[j] + " which is in maintenance state.";
+ LOG.info(output);
+ return output;
+ }
+ }
+ }
+ if (repl != nodes.length) {
+ output = "Wrong number of replicas for block " + blk.getBlock() +
+ ": expected " + repl + ", got " + nodes.length + " ,";
+ for (int j = 0; j < nodes.length; j++) { // for each replica
+ output += nodes[j] + ",";
+ }
+ output += "pending block # " + ns.getPendingReplicationBlocks() + " ,";
+ output += "under replicated # " + ns.getUnderReplicatedBlocks() + " ,";
+ if (expectedExcludedNode != null) {
+ output += "excluded node " + expectedExcludedNode;
+ }
+
+ LOG.info(output);
+ return output;
+ }
+
+ // Verify it has the expected maintenance node
+ Iterator<DatanodeStorageInfo> storageInfoIter =
+ bm.getStorages(blk.getBlock().getLocalBlock()).iterator();
+ List<DatanodeInfo> maintenanceNodes = new ArrayList<>();
+ while (storageInfoIter.hasNext()) {
+ DatanodeInfo node = storageInfoIter.next().getDatanodeDescriptor();
+ if (node.isMaintenance()) {
+ maintenanceNodes.add(node);
+ }
+ }
+
+ if (expectedMaintenanceNode != null) {
+ if (!maintenanceNodes.contains(expectedMaintenanceNode)) {
+ output = "No maintenance replica on " + expectedMaintenanceNode;
+ LOG.info(output);
+ return output;
+ }
+ } else {
+ if (maintenanceNodes.size() != 0) {
+ output = "Has maintenance replica(s)";
+ LOG.info(output);
+ return output;
+ }
}
}
return null;
}
+
+ static String checkWithRetry(FSNamesystem ns, FileSystem fileSys,
+ Path name, int repl, DatanodeInfo inMaintenanceNode)
+ throws IOException {
+ return checkWithRetry(ns, fileSys, name, repl, inMaintenanceNode,
+ inMaintenanceNode);
+ }
+
+ static String checkWithRetry(FSNamesystem ns, FileSystem fileSys,
+ Path name, int repl, DatanodeInfo excludedNode,
+ DatanodeInfo underMaintenanceNode) throws IOException {
+ int tries = 0;
+ String output = null;
+ while (tries++ < 200) {
+ try {
+ Thread.sleep(100);
+ output = checkFile(ns, fileSys, name, repl, excludedNode,
+ underMaintenanceNode);
+ if (output == null) {
+ break;
+ }
+ } catch (InterruptedException ie) {
+ }
+ }
+ return output;
+ }
+
+ static private DatanodeInfo[] getFirstBlockReplicasDatanodeInfos(
+ FileSystem fileSys, Path name) throws IOException {
+ // need a raw stream
+ assertTrue("Not HDFS:"+fileSys.getUri(),
+ fileSys instanceof DistributedFileSystem);
+ HdfsDataInputStream dis = (HdfsDataInputStream)fileSys.open(name);
+ Collection<LocatedBlock> dinfo = dis.getAllBlocks();
+ if (dinfo.iterator().hasNext()) { // for the first block
+ return dinfo.iterator().next().getLocations();
+ } else {
+ return null;
+ }
+ }
}
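
Two idioms recur in the rewritten tests above. First, short expirations are
no longer set up front (which raced against the admin-state transition):
a node is taken out of service with Long.MAX_VALUE and the expiration is
adjusted afterwards, which is why EXPIRATION_IN_MS could shrink from 500 to
50 ms. Second, assertions go through checkWithRetry, which re-runs checkFile
until it returns null or the attempt budget runs out. A self-contained
sketch of that retry idiom, with illustrative names not taken from the
patch:

import java.util.concurrent.Callable;

// Poll-until-null idiom: the check returns an error message on failure and
// null on success, so retrying until null means retrying until it passes.
public class RetrySketch {
  static String retryUntilNull(Callable<String> check, int maxTries,
      long sleepMs) throws Exception {
    String output = null;
    for (int tries = 0; tries < maxTries; tries++) {
      Thread.sleep(sleepMs);   // give the NameNode time to converge
      output = check.call();
      if (output == null) {
        break;                 // check passed
      }
    }
    return output;             // null on success, else the last error
  }

  public static void main(String[] args) throws Exception {
    final long readyAt = System.currentTimeMillis() + 300;
    String err = retryUntilNull(
        () -> System.currentTimeMillis() >= readyAt ? null : "not yet",
        200, 100);             // same 200 x 100 ms budget as checkWithRetry
    System.out.println(err == null ? "passed" : err);
  }
}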
http://git-wip-us.apache.org/repos/asf/hadoop/blob/b61fb267/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java
index 2c7c720..00bea1c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java
@@ -415,9 +415,10 @@ public class TestBlockManager {
throws Exception {
assertEquals(0, bm.numOfUnderReplicatedBlocks());
BlockInfo block = addBlockOnNodes(testIndex, origNodes);
- assertFalse(bm.isNeededReconstruction(block, bm.countLiveNodes(block)));
+ assertFalse(bm.isNeededReconstruction(block,
+ bm.countNodes(block, fsn.isInStartupSafeMode())));
}
-
+
@Test(timeout = 60000)
public void testNeededReconstructionWhileAppending() throws IOException {
Configuration conf = new HdfsConfiguration();
@@ -458,7 +459,8 @@ public class TestBlockManager {
namenode.updatePipeline(clientName, oldBlock, newBlock,
newLocatedBlock.getLocations(), newLocatedBlock.getStorageIDs());
BlockInfo bi = bm.getStoredBlock(newBlock.getLocalBlock());
- assertFalse(bm.isNeededReconstruction(bi, bm.countLiveNodes(bi)));
+ assertFalse(bm.isNeededReconstruction(bi, bm.countNodes(bi,
+ cluster.getNamesystem().isInStartupSafeMode())));
} finally {
IOUtils.closeStream(out);
}
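
The TestBlockManager hunks above track a BlockManager signature change:
isNeededReconstruction is now handed the full countNodes(...) replica
breakdown rather than a bare live count, so replicas on maintenance nodes
can be weighed separately. A simplified model of why that distinction
matters (illustrative only; the real policy lives in BlockManager):

// Replicas on maintenance nodes still exist in the block map but do not
// serve reads, so a single "live" number cannot decide reconstruction.
public class ReconstructionSketch {
  static boolean isNeededReconstruction(int live, int maintenance,
      int expected, int minMaintenanceR) {
    // While maintenance replicas exist, only enough live replicas to cover
    // max(expected - maintenance, minMaintenanceR) are required.
    int requiredLive = Math.max(expected - maintenance, minMaintenanceR);
    return live < requiredLive;
  }

  public static void main(String[] args) {
    // 3-replica block, one replica on an IN_MAINTENANCE node, minimum 1:
    System.out.println(isNeededReconstruction(2, 1, 3, 1)); // false
    // Same block after one more live replica is lost:
    System.out.println(isNeededReconstruction(1, 1, 3, 1)); // true
  }
}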
http://git-wip-us.apache.org/repos/asf/hadoop/blob/b61fb267/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDecommissioningStatus.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDecommissioningStatus.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDecommissioningStatus.java
index 6bb6040..7f1cc9a 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDecommissioningStatus.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDecommissioningStatus.java
@@ -26,17 +26,16 @@ import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
-import java.util.Random;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
-import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.hdfs.AdminStatesBaseTest;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
@@ -111,47 +110,22 @@ public class TestDecommissioningStatus {
if(cluster != null) cluster.shutdown();
}
- private FSDataOutputStream writeIncompleteFile(FileSystem fileSys, Path name,
- short repl) throws IOException {
- // create and write a file that contains three blocks of data
- FSDataOutputStream stm = fileSys.create(name, true, fileSys.getConf()
- .getInt(CommonConfigurationKeys.IO_FILE_BUFFER_SIZE_KEY, 4096), repl,
- blockSize);
- byte[] buffer = new byte[fileSize];
- Random rand = new Random(seed);
- rand.nextBytes(buffer);
- stm.write(buffer);
- // need to make sure that we actually write out both file blocks
- // (see FSOutputSummer#flush)
- stm.flush();
- // Do not close stream, return it
- // so that it is not garbage collected
- return stm;
- }
-
- static private void cleanupFile(FileSystem fileSys, Path name)
- throws IOException {
- assertTrue(fileSys.exists(name));
- fileSys.delete(name, true);
- assertTrue(!fileSys.exists(name));
- }
-
/*
* Decommissions the node at the given index
*/
- private String decommissionNode(FSNamesystem namesystem, DFSClient client,
+ private String decommissionNode(DFSClient client,
int nodeIndex) throws IOException {
DatanodeInfo[] info = client.datanodeReport(DatanodeReportType.LIVE);
String nodename = info[nodeIndex].getXferAddr();
- decommissionNode(namesystem, nodename);
+ decommissionNode(nodename);
return nodename;
}
/*
* Decommissions the node by name
*/
- private void decommissionNode(FSNamesystem namesystem, String dnName)
+ private void decommissionNode(String dnName)
throws IOException {
System.out.println("Decommissioning node: " + dnName);
@@ -166,14 +140,14 @@ public class TestDecommissioningStatus {
int expectedUnderRepInOpenFiles) {
assertEquals("Unexpected num under-replicated blocks",
expectedUnderRep,
- decommNode.decommissioningStatus.getUnderReplicatedBlocks());
+ decommNode.getLeavingServiceStatus().getUnderReplicatedBlocks());
assertEquals("Unexpected number of decom-only replicas",
expectedDecommissionOnly,
- decommNode.decommissioningStatus.getDecommissionOnlyReplicas());
+ decommNode.getLeavingServiceStatus().getOutOfServiceOnlyReplicas());
assertEquals(
"Unexpected number of replicas in under-replicated open files",
expectedUnderRepInOpenFiles,
- decommNode.decommissioningStatus.getUnderReplicatedInOpenFiles());
+ decommNode.getLeavingServiceStatus().getUnderReplicatedInOpenFiles());
}
private void checkDFSAdminDecommissionStatus(
@@ -237,13 +211,14 @@ public class TestDecommissioningStatus {
short replicas = numDatanodes;
//
// Decommission one node. Verify the decommission status
- //
+ //
Path file1 = new Path("decommission.dat");
DFSTestUtil.createFile(fileSys, file1, fileSize, fileSize, blockSize,
replicas, seed);
Path file2 = new Path("decommission1.dat");
- FSDataOutputStream st1 = writeIncompleteFile(fileSys, file2, replicas);
+ FSDataOutputStream st1 = AdminStatesBaseTest.writeIncompleteFile(fileSys,
+ file2, replicas, (short)(fileSize / blockSize));
for (DataNode d: cluster.getDataNodes()) {
DataNodeTestUtils.triggerBlockReport(d);
}
@@ -251,7 +226,7 @@ public class TestDecommissioningStatus {
FSNamesystem fsn = cluster.getNamesystem();
final DatanodeManager dm = fsn.getBlockManager().getDatanodeManager();
for (int iteration = 0; iteration < numDatanodes; iteration++) {
- String downnode = decommissionNode(fsn, client, iteration);
+ String downnode = decommissionNode(client, iteration);
dm.refreshNodes(conf);
decommissionedNodes.add(downnode);
BlockManagerTestUtil.recheckDecommissionState(dm);
@@ -281,8 +256,8 @@ public class TestDecommissioningStatus {
hostsFileWriter.initExcludeHost("");
dm.refreshNodes(conf);
st1.close();
- cleanupFile(fileSys, file1);
- cleanupFile(fileSys, file2);
+ AdminStatesBaseTest.cleanupFile(fileSys, file1);
+ AdminStatesBaseTest.cleanupFile(fileSys, file2);
}
/**
@@ -308,7 +283,7 @@ public class TestDecommissioningStatus {
// Decommission the DN.
FSNamesystem fsn = cluster.getNamesystem();
final DatanodeManager dm = fsn.getBlockManager().getDatanodeManager();
- decommissionNode(fsn, dnName);
+ decommissionNode(dnName);
dm.refreshNodes(conf);
// Stop the DN when decommission is in progress.
@@ -343,7 +318,7 @@ public class TestDecommissioningStatus {
// Delete the under-replicated file, which should let the
// DECOMMISSION_IN_PROGRESS node become DECOMMISSIONED
- cleanupFile(fileSys, f);
+ AdminStatesBaseTest.cleanupFile(fileSys, f);
BlockManagerTestUtil.recheckDecommissionState(dm);
assertTrue("the node should be decommissioned",
dead.get(0).isDecommissioned());
@@ -376,7 +351,7 @@ public class TestDecommissioningStatus {
FSNamesystem fsn = cluster.getNamesystem();
final DatanodeManager dm = fsn.getBlockManager().getDatanodeManager();
DatanodeDescriptor dnDescriptor = dm.getDatanode(dnID);
- decommissionNode(fsn, dnName);
+ decommissionNode(dnName);
dm.refreshNodes(conf);
BlockManagerTestUtil.recheckDecommissionState(dm);
assertTrue(dnDescriptor.isDecommissioned());
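
In the TestDecommissioningStatus changes above, the decommission-specific
decommissioningStatus field gives way to a getLeavingServiceStatus()
accessor, and getDecommissionOnlyReplicas() becomes
getOutOfServiceOnlyReplicas(): the same counters now describe nodes leaving
service for either decommission or maintenance. The private file helpers
are likewise replaced by shared static ones on AdminStatesBaseTest; for
reference, the deleted cleanupFile body, which the shared helper presumably
mirrors, was:

import static org.junit.Assert.assertTrue;

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Reconstructed from the lines deleted above; the copy on
// AdminStatesBaseTest is assumed to behave the same way.
class CleanupHelperSketch {
  static void cleanupFile(FileSystem fileSys, Path name) throws IOException {
    assertTrue(fileSys.exists(name));
    fileSys.delete(name, true);       // recursive delete
    assertTrue(!fileSys.exists(name));
  }
}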
http://git-wip-us.apache.org/repos/asf/hadoop/blob/b61fb267/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNamenodeCapacityReport.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNamenodeCapacityReport.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNamenodeCapacityReport.java
index 38ec9f8..c9fe2c3 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNamenodeCapacityReport.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNamenodeCapacityReport.java
@@ -195,9 +195,17 @@ public class TestNamenodeCapacityReport {
private static final float EPSILON = 0.0001f;
@Test
public void testXceiverCount() throws Exception {
+ testXceiverCountInternal(0);
+ testXceiverCountInternal(1);
+ }
+
+ public void testXceiverCountInternal(int minMaintenanceR) throws Exception {
Configuration conf = new HdfsConfiguration();
// retry one time, if close fails
- conf.setInt(HdfsClientConfigKeys.BlockWrite.LOCATEFOLLOWINGBLOCK_RETRIES_KEY, 1);
+ conf.setInt(
+ HdfsClientConfigKeys.BlockWrite.LOCATEFOLLOWINGBLOCK_RETRIES_KEY, 1);
+ conf.setInt(DFSConfigKeys.DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY,
+ minMaintenanceR);
MiniDFSCluster cluster = null;
final int nodes = 8;
@@ -220,23 +228,23 @@ public class TestNamenodeCapacityReport {
int expectedTotalLoad = nodes; // xceiver server adds 1 to load
int expectedInServiceNodes = nodes;
int expectedInServiceLoad = nodes;
- checkClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes, expectedInServiceLoad);
-
- // shutdown half the nodes and force a heartbeat check to ensure
- // counts are accurate
+ checkClusterHealth(nodes, namesystem, expectedTotalLoad,
+ expectedInServiceNodes, expectedInServiceLoad);
+
+ // Shutdown half the nodes followed by admin operations on those nodes.
+ // Ensure counts are accurate.
for (int i=0; i < nodes/2; i++) {
DataNode dn = datanodes.get(i);
DatanodeDescriptor dnd = dnm.getDatanode(dn.getDatanodeId());
dn.shutdown();
DFSTestUtil.setDatanodeDead(dnd);
BlockManagerTestUtil.checkHeartbeat(namesystem.getBlockManager());
- //Verify decommission of dead node won't impact nodesInService metrics.
- dnm.getDecomManager().startDecommission(dnd);
+ //Admin operations on dead nodes won't impact nodesInService metrics.
+ startDecommissionOrMaintenance(dnm, dnd, (i % 2 == 0));
expectedInServiceNodes--;
assertEquals(expectedInServiceNodes, namesystem.getNumLiveDataNodes());
assertEquals(expectedInServiceNodes, getNumDNInService(namesystem));
- //Verify recommission of dead node won't impact nodesInService metrics.
- dnm.getDecomManager().stopDecommission(dnd);
+ stopDecommissionOrMaintenance(dnm, dnd, (i % 2 == 0));
assertEquals(expectedInServiceNodes, getNumDNInService(namesystem));
}
@@ -247,8 +255,9 @@ public class TestNamenodeCapacityReport {
datanodes = cluster.getDataNodes();
expectedInServiceNodes = nodes;
assertEquals(nodes, datanodes.size());
- checkClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes, expectedInServiceLoad);
-
+ checkClusterHealth(nodes, namesystem, expectedTotalLoad,
+ expectedInServiceNodes, expectedInServiceLoad);
+
// create streams and hsync to force datastreamers to start
DFSOutputStream[] streams = new DFSOutputStream[fileCount];
for (int i=0; i < fileCount; i++) {
@@ -263,30 +272,32 @@ public class TestNamenodeCapacityReport {
}
// force nodes to send load update
triggerHeartbeats(datanodes);
- checkClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes, expectedInServiceLoad);
+ checkClusterHealth(nodes, namesystem, expectedTotalLoad,
+ expectedInServiceNodes, expectedInServiceLoad);
- // decomm a few nodes, substract their load from the expected load,
- // trigger heartbeat to force load update
+ // admin operations on a few nodes, subtract their load from the
+ // expected load, trigger heartbeat to force load update.
for (int i=0; i < fileRepl; i++) {
expectedInServiceNodes--;
DatanodeDescriptor dnd =
dnm.getDatanode(datanodes.get(i).getDatanodeId());
expectedInServiceLoad -= dnd.getXceiverCount();
- dnm.getDecomManager().startDecommission(dnd);
+ startDecommissionOrMaintenance(dnm, dnd, (i % 2 == 0));
DataNodeTestUtils.triggerHeartbeat(datanodes.get(i));
Thread.sleep(100);
- checkClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes, expectedInServiceLoad);
+ checkClusterHealth(nodes, namesystem, expectedTotalLoad,
+ expectedInServiceNodes, expectedInServiceLoad);
}
-
+
// check expected load while closing each stream. recalc expected
// load based on whether the nodes in the pipeline are decomm
for (int i=0; i < fileCount; i++) {
- int decomm = 0;
+ int adminOps = 0;
for (DatanodeInfo dni : streams[i].getPipeline()) {
DatanodeDescriptor dnd = dnm.getDatanode(dni);
expectedTotalLoad -= 2;
- if (dnd.isDecommissionInProgress() || dnd.isDecommissioned()) {
- decomm++;
+ if (!dnd.isInService()) {
+ adminOps++;
} else {
expectedInServiceLoad -= 2;
}
@@ -297,16 +308,17 @@ public class TestNamenodeCapacityReport {
// nodes will go decommissioned even if there's a UC block whose
// other locations are decommissioned too. we'll ignore that
// bug for now
- if (decomm < fileRepl) {
+ if (adminOps < fileRepl) {
throw ioe;
}
}
triggerHeartbeats(datanodes);
// verify node count and loads
- checkClusterHealth(nodes, namesystem, expectedTotalLoad, expectedInServiceNodes, expectedInServiceLoad);
+ checkClusterHealth(nodes, namesystem, expectedTotalLoad,
+ expectedInServiceNodes, expectedInServiceLoad);
}
- // shutdown each node, verify node counts based on decomm state
+ // shutdown each node, verify node counts based on admin state
for (int i=0; i < nodes; i++) {
DataNode dn = datanodes.get(i);
dn.shutdown();
@@ -320,13 +332,11 @@ public class TestNamenodeCapacityReport {
expectedInServiceNodes--;
}
assertEquals(expectedInServiceNodes, getNumDNInService(namesystem));
-
// live nodes always report load of 1. no nodes is load 0
double expectedXceiverAvg = (i == nodes-1) ? 0.0 : 1.0;
assertEquals((double)expectedXceiverAvg,
getInServiceXceiverAverage(namesystem), EPSILON);
}
-
// final sanity check
checkClusterHealth(0, namesystem, 0.0, 0, 0.0);
} finally {
@@ -336,6 +346,24 @@ public class TestNamenodeCapacityReport {
}
}
+ private void startDecommissionOrMaintenance(DatanodeManager dnm,
+ DatanodeDescriptor dnd, boolean decomm) {
+ if (decomm) {
+ dnm.getDecomManager().startDecommission(dnd);
+ } else {
+ dnm.getDecomManager().startMaintenance(dnd, Long.MAX_VALUE);
+ }
+ }
+
+ private void stopDecommissionOrMaintenance(DatanodeManager dnm,
+ DatanodeDescriptor dnd, boolean decomm) {
+ if (decomm) {
+ dnm.getDecomManager().stopDecommission(dnd);
+ } else {
+ dnm.getDecomManager().stopMaintenance(dnd);
+ }
+ }
+
private static void checkClusterHealth(
int numOfLiveNodes,
FSNamesystem namesystem, double expectedTotalLoad,
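
The TestNamenodeCapacityReport changes above alternate decommission and
maintenance across nodes and replace the explicit decommission checks with
!dnd.isInService(), since any admin state takes a node out of service for
load accounting. A compact model of that bookkeeping (illustrative; the
enum mirrors DatanodeInfo.AdminStates):

// Any non-NORMAL admin state is out of service, so such nodes are excluded
// from the in-service xceiver load that the test tracks.
public class InServiceLoadSketch {
  enum AdminState { NORMAL, DECOMMISSION_INPROGRESS, DECOMMISSIONED,
      ENTERING_MAINTENANCE, IN_MAINTENANCE }

  static boolean isInService(AdminState state) {
    return state == AdminState.NORMAL;
  }

  public static void main(String[] args) {
    AdminState[] pipeline = { AdminState.NORMAL,
        AdminState.DECOMMISSION_INPROGRESS, AdminState.IN_MAINTENANCE };
    int adminOps = 0;
    int inServiceLoad = 0;
    for (AdminState state : pipeline) {
      if (!isInService(state)) {
        adminOps++;            // decommissioning or maintenance node
      } else {
        inServiceLoad += 2;    // reader + writer xceiver, as in the test
      }
    }
    System.out.println(adminOps + " " + inServiceLoad); // prints "2 2"
  }
}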
http://git-wip-us.apache.org/repos/asf/hadoop/blob/b61fb267/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/util/HostsFileWriter.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/util/HostsFileWriter.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/util/HostsFileWriter.java
index 4c8fcef..e171e2b 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/util/HostsFileWriter.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/util/HostsFileWriter.java
@@ -54,6 +54,7 @@ public class HostsFileWriter {
localFileSys = FileSystem.getLocal(conf);
Path workingDir = new Path(MiniDFSCluster.getBaseDirectory());
this.fullDir = new Path(workingDir, dir);
+ cleanup(); // In case there are leftovers from a previous run.
assertTrue(localFileSys.mkdirs(this.fullDir));
if (conf.getClass(
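
The HostsFileWriter change above is a one-line hardening of test setup:
any directory left over from a previous (possibly crashed) run is removed
before mkdirs, so the assertion cannot fail spuriously. The same idiom in
plain Java (illustrative sketch using java.nio rather than the Hadoop
FileSystem API):

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Comparator;
import java.util.stream.Stream;

// Recreate a directory cleanly: delete any leftovers first, then mkdir.
public class IdempotentSetupSketch {
  static void recreate(Path dir) throws IOException {
    if (Files.exists(dir)) {
      // Collect deepest paths first so children are deleted before parents.
      try (Stream<Path> walk = Files.walk(dir)) {
        for (Path p : walk.sorted(Comparator.reverseOrder())
            .toArray(Path[]::new)) {
          Files.delete(p);
        }
      }
    }
    Files.createDirectories(dir);
  }

  public static void main(String[] args) throws IOException {
    Path dir = Files.createTempDirectory("hosts").resolve("excludes");
    recreate(dir);   // fresh run
    recreate(dir);   // leftover from the "previous run" is cleaned first
    System.out.println(Files.isDirectory(dir)); // prints true
  }
}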