You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by sz...@apache.org on 2012/08/02 07:22:43 UTC
svn commit: r1368353 - in /hadoop/common/branches/branch-1: ./ src/hdfs/
src/hdfs/org/apache/hadoop/hdfs/
src/hdfs/org/apache/hadoop/hdfs/server/namenode/
src/test/org/apache/hadoop/hdfs/
Author: szetszwo
Date: Thu Aug 2 05:22:43 2012
New Revision: 1368353
URL: http://svn.apache.org/viewvc?rev=1368353&view=rev
Log:
HDFS-528. Backport: Add ability for safemode to wait for a minimum number of live datanodes.
Modified:
hadoop/common/branches/branch-1/CHANGES.txt
hadoop/common/branches/branch-1/src/hdfs/hdfs-default.xml
hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/DFSConfigKeys.java
hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/TestSafeMode.java
Modified: hadoop/common/branches/branch-1/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/CHANGES.txt?rev=1368353&r1=1368352&r2=1368353&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/CHANGES.txt (original)
+++ hadoop/common/branches/branch-1/CHANGES.txt Thu Aug 2 05:22:43 2012
@@ -19,6 +19,9 @@ Release 1.2.0 - unreleased
MAPREDUCE-987. Exposing MiniDFS and MiniMR clusters as a single process command-line (philip and ahmed via tucu)
+ HDFS-528. Backport: Add ability for safemode to wait for a minimum number
+ of live datanodes. (szetszwo)
+
IMPROVEMENTS
HDFS-3515. Port HDFS-1457 to branch-1. (eli)
Modified: hadoop/common/branches/branch-1/src/hdfs/hdfs-default.xml
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/hdfs/hdfs-default.xml?rev=1368353&r1=1368352&r2=1368353&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/src/hdfs/hdfs-default.xml (original)
+++ hadoop/common/branches/branch-1/src/hdfs/hdfs-default.xml Thu Aug 2 05:22:43 2012
@@ -304,9 +304,24 @@ creations/deletions), or "all".</descrip
<description>
Specifies the percentage of blocks that should satisfy
the minimal replication requirement defined by dfs.replication.min.
- Values less than or equal to 0 mean not to start in safe mode.
+ Values less than or equal to 0 mean not to wait for any particular
+ percentage of blocks before exiting safemode.
Values greater than 1 will make safe mode permanent.
</description>
+ </property>
+
+<property>
+ <name>dfs.namenode.safemode.min.datanodes</name>
+ <value>0</value>
+ <description>
+ Specifies the number of datanodes that must be considered alive
+ before the name node exits safemode.
+ Values less than or equal to 0 mean not to take the number of live
+ datanodes into account when deciding whether to remain in safe mode
+ during startup.
+ Values greater than the number of datanodes in the cluster
+ will make safe mode permanent.
+ </description>
</property>
<property>
Modified: hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/DFSConfigKeys.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/DFSConfigKeys.java?rev=1368353&r1=1368352&r2=1368353&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/DFSConfigKeys.java (original)
+++ hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/DFSConfigKeys.java Thu Aug 2 05:22:43 2012
@@ -72,6 +72,8 @@ public class DFSConfigKeys extends Commo
public static final int DFS_NAMENODE_SAFEMODE_EXTENSION_DEFAULT = 30000;
public static final String DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY = "dfs.namenode.safemode.threshold-pct";
public static final float DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT = 0.999f;
+ public static final String DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY = "dfs.namenode.safemode.min.datanodes";
+ public static final int DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT = 0;
public static final String DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY = "dfs.namenode.secondary.http-address";
public static final String DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_DEFAULT = "0.0.0.0:50090";
public static final String DFS_NAMENODE_CHECKPOINT_PERIOD_KEY = "dfs.namenode.checkpoint.period";
Modified: hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java?rev=1368353&r1=1368352&r2=1368353&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java (original)
+++ hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java Thu Aug 2 05:22:43 2012
@@ -2532,6 +2532,10 @@ public class FSNamesystem implements FSC
// no need to update its timestamp
// because it is done when the descriptor is created
}
+
+ if (safeMode != null) {
+ safeMode.checkMode();
+ }
return;
}
@@ -3321,6 +3325,10 @@ public class FSNamesystem implements FSC
}
unprotectedRemoveDatanode(nodeInfo);
clusterMap.remove(nodeInfo);
+
+ if (safeMode != null) {
+ safeMode.checkMode();
+ }
}
void unprotectedRemoveDatanode(DatanodeDescriptor nodeDescr) {
@@ -4199,6 +4207,10 @@ public class FSNamesystem implements FSC
}
}
+ int getNumLiveDataNodes() {
+ return getNumberOfDatanodes(DatanodeReportType.LIVE);
+ }
+
int getNumberOfDatanodes(DatanodeReportType type) {
return getDatanodeListForReport(type).size();
}
@@ -4733,6 +4745,8 @@ public class FSNamesystem implements FSC
// configuration fields
/** Safe mode threshold condition %.*/
private double threshold;
+ /** Safe mode minimum number of datanodes alive */
+ private int datanodeThreshold;
/** Safe mode extension after the threshold. */
private int extension;
/** Min replication required by safe mode. */
@@ -4760,6 +4774,9 @@ public class FSNamesystem implements FSC
*/
SafeModeInfo(Configuration conf) {
this.threshold = conf.getFloat("dfs.safemode.threshold.pct", 0.95f);
+ this.datanodeThreshold = conf.getInt(
+ DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY,
+ DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT);
this.extension = conf.getInt("dfs.safemode.extension", 0);
this.safeReplication = conf.getInt("dfs.replication.min", 1);
this.blockTotal = 0;
@@ -4776,6 +4793,7 @@ public class FSNamesystem implements FSC
*/
private SafeModeInfo() {
this.threshold = 1.5f; // this threshold can never be reached
+ this.datanodeThreshold = Integer.MAX_VALUE;
this.extension = Integer.MAX_VALUE;
this.safeReplication = Short.MAX_VALUE + 1; // more than maxReplication
this.blockTotal = -1;
@@ -4874,7 +4892,8 @@ public class FSNamesystem implements FSC
* if DFS is empty or {@link #threshold} == 0
*/
boolean needEnter() {
- return getSafeBlockRatio() < threshold;
+ return getSafeBlockRatio() < threshold ||
+ getNumLiveDataNodes() < datanodeThreshold;
}
/**
@@ -4971,15 +4990,44 @@ public class FSNamesystem implements FSC
}
if(blockTotal < 0)
return leaveMsg + ".";
- String safeBlockRatioMsg =
- String.format("The ratio of reported blocks %.4f has " +
- (reached == 0 ? "not " : "") + "reached the threshold %.4f. ",
- getSafeBlockRatio(), threshold) + leaveMsg;
- if(reached == 0 || isManual()) // threshold is not reached or manual
- return safeBlockRatioMsg + ".";
+
+ int numLive = getNumLiveDataNodes();
+ String msg = "";
+ if (reached == 0) {
+ if (getSafeBlockRatio() < threshold) {
+ msg += String.format(
+ "The reported blocks is only %d"
+ + " but the threshold is %.4f and the total blocks %d.",
+ blockSafe, threshold, blockTotal);
+ }
+ if (numLive < datanodeThreshold) {
+ if (!"".equals(msg)) {
+ msg += "\n";
+ }
+ msg += String.format(
+ "The number of live datanodes %d needs an additional %d live "
+ + "datanodes to reach the minimum number %d.",
+ numLive, (datanodeThreshold - numLive), datanodeThreshold);
+ }
+ msg += " " + leaveMsg;
+ } else {
+ msg = String.format("The reported blocks %d has reached the threshold"
+ + " %.4f of total blocks %d.", blockSafe, threshold,
+ blockTotal);
+
+ if (datanodeThreshold > 0) {
+ msg += String.format(" The number of live datanodes %d has reached "
+ + "the minimum number %d.",
+ numLive, datanodeThreshold);
+ }
+ msg += " " + leaveMsg;
+ }
+ if(reached == 0 || isManual()) { // threshold is not reached or manual
+ return msg + ".";
+ }
// extension period is in progress
- return safeBlockRatioMsg + " in "
- + Math.abs(reached + extension - now())/1000 + " seconds.";
+ return msg + " in " + Math.abs(reached + extension - now()) / 1000
+ + " seconds.";
}
/**
@@ -5157,7 +5205,7 @@ public class FSNamesystem implements FSC
safeMode.leave(checkForUpgrades);
}
- String getSafeModeTip() {
+ public String getSafeModeTip() {
if (!isInSafeMode())
return "";
return safeMode.getTurnOffTip();
Modified: hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/TestSafeMode.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/TestSafeMode.java?rev=1368353&r1=1368352&r2=1368353&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/TestSafeMode.java (original)
+++ hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/TestSafeMode.java Thu Aug 2 05:22:43 2012
@@ -99,6 +99,50 @@ public class TestSafeMode {
}
}
+ /**
+ * Verify that the NameNode stays in safemode when
+ * dfs.namenode.safemode.min.datanodes exceeds the number of live datanodes.
+ */
+ @Test
+ public void testDatanodeThreshold() throws IOException {
+ MiniDFSCluster cluster = null;
+ DistributedFileSystem fs = null;
+ try {
+ Configuration conf = new Configuration();
+ conf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0);
+ conf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY, 1);
+
+ // bring up a cluster with no datanodes
+ cluster = new MiniDFSCluster(conf, 0, true, null);
+ cluster.waitActive();
+ fs = (DistributedFileSystem)cluster.getFileSystem();
+
+ assertTrue("No datanode started, but we require one - safemode expected",
+ fs.setSafeMode(SafeModeAction.SAFEMODE_GET));
+
+ String tipMsg = cluster.getNameNode().getNamesystem().getSafeModeTip();
+ assertTrue("Safemode tip message looks right",
+ tipMsg.contains("The number of live datanodes 0 needs an " +
+ "additional 1 live"));
+
+ // Start a datanode
+ cluster.startDataNodes(conf, 1, true, null, null);
+
+ // Wait long enough for safemode check to refire
+ try {
+ Thread.sleep(1000);
+ } catch (InterruptedException ignored) {}
+
+ // We now should be out of safe mode.
+ assertFalse(
+ "Out of safe mode after starting datanode.",
+ fs.setSafeMode(SafeModeAction.SAFEMODE_GET));
+ } finally {
+ if (fs != null) fs.close();
+ if (cluster != null) cluster.shutdown();
+ }
+ }
+
@Test
public void testSafeModeWhenZeroBlockLocations() throws IOException {
MiniDFSCluster cluster = null;