You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by we...@apache.org on 2019/04/16 18:10:24 UTC
[hadoop] branch branch-2.8 updated: HDFS-10477. Stop decommission a
rack of DataNodes caused NameNode fail over to standby. Contributed by
yunjiong zhao, Wei-Chiu Chuang and star.
This is an automated email from the ASF dual-hosted git repository.
weichiu pushed a commit to branch branch-2.8
in repository https://gitbox.apache.org/repos/asf/hadoop.git
The following commit(s) were added to refs/heads/branch-2.8 by this push:
new 93394c8 HDFS-10477. Stop decommission a rack of DataNodes caused NameNode fail over to standby. Contributed by yunjiong zhao, Wei-Chiu Chuang and star.
93394c8 is described below
commit 93394c8f5541c49aa54ee26d45f7dc77f86ec02c
Author: Wei-Chiu Chuang <we...@apache.org>
AuthorDate: Tue Apr 16 11:09:16 2019 -0700
HDFS-10477. Stop decommission a rack of DataNodes caused NameNode fail over to standby. Contributed by yunjiong zhao, Wei-Chiu Chuang and star.
Signed-off-by: Wei-Chiu Chuang <we...@apache.org>
---
.../hdfs/server/blockmanagement/BlockManager.java | 53 ++++++++++++++++------
1 file changed, 39 insertions(+), 14 deletions(-)
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
index e2bdfcb..4012f57 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
@@ -3430,21 +3430,46 @@ public class BlockManager implements BlockStatsMXBean {
if (!isPopulatingReplQueues()) {
return;
}
- final Iterator<BlockInfo> it = srcNode.getBlockIterator();
+
int numOverReplicated = 0;
- while(it.hasNext()) {
- final BlockInfo block = it.next();
- short expectedReplication = block.getReplication();
- NumberReplicas num = countNodes(block);
- int numCurrentReplica = num.liveReplicas();
- if (numCurrentReplica > expectedReplication) {
- // over-replicated block
- processOverReplicatedBlock(block, expectedReplication, null, null);
- numOverReplicated++;
- }
- }
- LOG.info("Invalidated " + numOverReplicated + " over-replicated blocks on " +
- srcNode + " during recommissioning");
+ for (DatanodeStorageInfo datanodeStorageInfo : srcNode.getStorageInfos()) {
+ // The namesystem lock is released between iterations, so make sure the
+ // storage has not been removed before continuing.
+ if (srcNode.getStorageInfo(datanodeStorageInfo.getStorageID()) == null) {
+ continue;
+ }
+ final Iterator<BlockInfo> it = datanodeStorageInfo.getBlockIterator();
+ while (it.hasNext()) {
+ final BlockInfo block = it.next();
+ if (block.isDeleted()) {
+ // Orphan block; it will be handled eventually, so skip it.
+ continue;
+ }
+ short expectedReplication = this.getExpectedReplicaNum(block);
+ NumberReplicas num = countNodes(block);
+ int numCurrentReplica = num.liveReplicas();
+ if (numCurrentReplica > expectedReplication) {
+ // over-replicated block
+ processOverReplicatedBlock(block, expectedReplication, null,
+ null);
+ numOverReplicated++;
+ }
+ }
+ // Only DatanodeManager.refreshNodes calls this with the write lock held;
+ // tests such as TestDefaultBlockPlacementPolicy.
+ // testPlacementWithLocalRackNodesDecommissioned call it without the lock.
+ if (namesystem.hasWriteLock()) {
+ namesystem.writeUnlock();
+ try {
+ Thread.sleep(1);
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ }
+ namesystem.writeLock();
+ }
+ }
+ LOG.info("Invalidated " + numOverReplicated +
+ " over-replicated blocks on " + srcNode + " during recommissioning");
}
/**
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org