You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by cu...@apache.org on 2007/06/15 23:45:04 UTC
svn commit: r547800 - in /lucene/hadoop/trunk: CHANGES.txt
src/java/org/apache/hadoop/dfs/FSNamesystem.java
Author: cutting
Date: Fri Jun 15 14:45:03 2007
New Revision: 547800
URL: http://svn.apache.org/viewvc?view=rev&rev=547800
Log:
HADOOP-1300. Improve removal of excess block replicas to be rack-aware. Contributed by Hairong.
Modified:
lucene/hadoop/trunk/CHANGES.txt
lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java
Modified: lucene/hadoop/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?view=diff&rev=547800&r1=547799&r2=547800
==============================================================================
--- lucene/hadoop/trunk/CHANGES.txt (original)
+++ lucene/hadoop/trunk/CHANGES.txt Fri Jun 15 14:45:03 2007
@@ -140,6 +140,10 @@
44. HADOOP-1482. Fix secondary namenode to roll info port.
(Dhruba Borthakur via cutting)
+ 45. HADOOP-1300. Improve removal of excess block replicas to be
+ rack-aware. Attempts are now made to keep replicas on more
+ racks. (Hairong Kuang via cutting)
+
Release 0.13.0 - 2007-06-08
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java?view=diff&rev=547800&r1=547799&r2=547800
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java Fri Jun 15 14:45:03 2007
@@ -29,6 +29,7 @@
import java.io.*;
import java.util.*;
+import java.util.Map.Entry;
/***************************************************
* FSNamesystem does the actual bookkeeping work for the
@@ -2179,18 +2180,57 @@
*
* srcNodes.size() - dstNodes.size() == replication
*
- * We pick node with least free space
- * In the future, we might enforce some kind of policy
- * (like making sure replicates are spread across racks).
+ * We pick node that make sure that replicas are spread across racks and
+ * also try hard to pick one with least free space.
+ * The algorithm is first to pick a node with least free space from nodes
+ * that are on a rack holding more than one replicas of the block.
+ * So removing such a replica won't remove a rack.
+ * If no such a node is available,
+ * then pick a node with least free space
*/
void chooseExcessReplicates(Collection<DatanodeDescriptor> nonExcess,
Block b, short replication) {
+ // first form a rack to datanodes map and
+ HashMap<String, ArrayList<DatanodeDescriptor>> rackMap =
+ new HashMap<String, ArrayList<DatanodeDescriptor>>();
+ for (Iterator<DatanodeDescriptor> iter = nonExcess.iterator();
+ iter.hasNext();) {
+ DatanodeDescriptor node = iter.next();
+ String rackName = node.getNetworkLocation();
+ ArrayList<DatanodeDescriptor> datanodeList = rackMap.get(rackName);
+ if(datanodeList==null) {
+ datanodeList = new ArrayList<DatanodeDescriptor>();
+ }
+ datanodeList.add(node);
+ rackMap.put(rackName, datanodeList);
+ }
+
+ // split nodes into two sets
+ // priSet contains nodes on rack with more than one replica
+ // remains contains the remaining nodes
+ ArrayList<DatanodeDescriptor> priSet = new ArrayList<DatanodeDescriptor>();
+ ArrayList<DatanodeDescriptor> remains = new ArrayList<DatanodeDescriptor>();
+ for( Iterator<Entry<String, ArrayList<DatanodeDescriptor>>> iter =
+ rackMap.entrySet().iterator(); iter.hasNext(); ) {
+ Entry<String, ArrayList<DatanodeDescriptor>> rackEntry = iter.next();
+ ArrayList<DatanodeDescriptor> datanodeList = rackEntry.getValue();
+ if( datanodeList.size() == 1 ) {
+ remains.add(datanodeList.get(0));
+ } else {
+ priSet.addAll(datanodeList);
+ }
+ }
+
+ // pick one node with least space from priSet if it is not empty
+ // otherwise one node with least space from remains
while (nonExcess.size() - replication > 0) {
DatanodeInfo cur = null;
long minSpace = Long.MAX_VALUE;
-
- for (Iterator<DatanodeDescriptor> iter = nonExcess.iterator(); iter.hasNext();) {
- DatanodeInfo node = iter.next();
+
+ Iterator<DatanodeDescriptor> iter =
+ priSet.isEmpty() ? remains.iterator() : priSet.iterator();
+ while( iter.hasNext() ) {
+ DatanodeDescriptor node = iter.next();
long free = node.getRemaining();
if (minSpace > free) {
@@ -2198,7 +2238,24 @@
cur = node;
}
}
-
+
+ // adjust rackmap, priSet, and remains
+ String rack = cur.getNetworkLocation();
+ ArrayList<DatanodeDescriptor> datanodes = rackMap.get(rack);
+ datanodes.remove(cur);
+ if(datanodes.isEmpty()) {
+ rackMap.remove(rack);
+ }
+ if (priSet.isEmpty()) {
+ remains.remove(cur);
+ } else {
+ priSet.remove(cur);
+ if (datanodes.size() == 1) {
+ priSet.remove(datanodes.get(0));
+ remains.add(datanodes.get(0));
+ }
+ }
+
nonExcess.remove(cur);
Collection<Block> excessBlocks = excessReplicateMap.get(cur.getStorageID());