You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by ma...@apache.org on 2012/11/19 07:52:25 UTC

svn commit: r1411081 - in /hadoop/common/branches/branch-1.1: ./ src/hdfs/ src/hdfs/org/apache/hadoop/hdfs/ src/hdfs/org/apache/hadoop/hdfs/server/namenode/ src/test/org/apache/hadoop/hdfs/

Author: mattf
Date: Mon Nov 19 06:52:24 2012
New Revision: 1411081

URL: http://svn.apache.org/viewvc?rev=1411081&view=rev
Log:
merged r1368353 from branch-1. HDFS-528. Backport: Add ability for safemode to wait for a minimum number of live datanodes.  Contributed by Tsz Wo Sze.

Modified:
    hadoop/common/branches/branch-1.1/CHANGES.txt   (contents, props changed)
    hadoop/common/branches/branch-1.1/src/hdfs/hdfs-default.xml
    hadoop/common/branches/branch-1.1/src/hdfs/org/apache/hadoop/hdfs/DFSConfigKeys.java
    hadoop/common/branches/branch-1.1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
    hadoop/common/branches/branch-1.1/src/test/org/apache/hadoop/hdfs/TestSafeMode.java

Modified: hadoop/common/branches/branch-1.1/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1.1/CHANGES.txt?rev=1411081&r1=1411080&r2=1411081&view=diff
==============================================================================
--- hadoop/common/branches/branch-1.1/CHANGES.txt (original)
+++ hadoop/common/branches/branch-1.1/CHANGES.txt Mon Nov 19 06:52:24 2012
@@ -6,6 +6,9 @@ Release 1.1.1 - Unreleased
 
   NEW FEATURES
 
+    HDFS-528. Backport: Add ability for safemode to wait for a minimum number
+    of live datanodes.  (szetszwo)
+
   IMPROVEMENTS
 
     HADOOP-8823. ant package target should not depend on cn-docs. (szetszwo)

Propchange: hadoop/common/branches/branch-1.1/CHANGES.txt
------------------------------------------------------------------------------
  Merged /hadoop/common/branches/branch-1/CHANGES.txt:r1368353

Modified: hadoop/common/branches/branch-1.1/src/hdfs/hdfs-default.xml
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1.1/src/hdfs/hdfs-default.xml?rev=1411081&r1=1411080&r2=1411081&view=diff
==============================================================================
--- hadoop/common/branches/branch-1.1/src/hdfs/hdfs-default.xml (original)
+++ hadoop/common/branches/branch-1.1/src/hdfs/hdfs-default.xml Mon Nov 19 06:52:24 2012
@@ -304,9 +304,24 @@ creations/deletions), or "all".</descrip
   <description>
     Specifies the percentage of blocks that should satisfy 
     the minimal replication requirement defined by dfs.replication.min.
-    Values less than or equal to 0 mean not to start in safe mode.
+    Values less than or equal to 0 mean not to wait for any particular
+    percentage of blocks before exiting safemode.
     Values greater than 1 will make safe mode permanent.
   </description>
+ </property>
+ 
+<property>
+  <name>dfs.namenode.safemode.min.datanodes</name>
+  <value>0</value>
+  <description>
+    Specifies the number of datanodes that must be considered alive
+    before the name node exits safemode.
+    Values less than or equal to 0 mean not to take the number of live
+    datanodes into account when deciding whether to remain in safe mode
+    during startup.
+    Values greater than the number of datanodes in the cluster
+    will make safe mode permanent.
+  </description>
 </property>
 
 <property>

Modified: hadoop/common/branches/branch-1.1/src/hdfs/org/apache/hadoop/hdfs/DFSConfigKeys.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1.1/src/hdfs/org/apache/hadoop/hdfs/DFSConfigKeys.java?rev=1411081&r1=1411080&r2=1411081&view=diff
==============================================================================
--- hadoop/common/branches/branch-1.1/src/hdfs/org/apache/hadoop/hdfs/DFSConfigKeys.java (original)
+++ hadoop/common/branches/branch-1.1/src/hdfs/org/apache/hadoop/hdfs/DFSConfigKeys.java Mon Nov 19 06:52:24 2012
@@ -72,6 +72,8 @@ public class DFSConfigKeys extends Commo
   public static final int     DFS_NAMENODE_SAFEMODE_EXTENSION_DEFAULT = 30000;
   public static final String  DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY = "dfs.namenode.safemode.threshold-pct";
   public static final float   DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT = 0.999f;
+  public static final String  DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY = "dfs.namenode.safemode.min.datanodes";
+  public static final int     DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT = 0;
   public static final String  DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY = "dfs.namenode.secondary.http-address";
   public static final String  DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_DEFAULT = "0.0.0.0:50090";
   public static final String  DFS_NAMENODE_CHECKPOINT_PERIOD_KEY = "dfs.namenode.checkpoint.period";

Modified: hadoop/common/branches/branch-1.1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1.1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java?rev=1411081&r1=1411080&r2=1411081&view=diff
==============================================================================
--- hadoop/common/branches/branch-1.1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java (original)
+++ hadoop/common/branches/branch-1.1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java Mon Nov 19 06:52:24 2012
@@ -2616,6 +2616,10 @@ public class FSNamesystem implements FSC
       // no need to update its timestamp
       // because its is done when the descriptor is created
     }
+
+    if (safeMode != null) {
+      safeMode.checkMode();
+    }
     return;
   }
     
@@ -3403,6 +3407,10 @@ public class FSNamesystem implements FSC
     }
     unprotectedRemoveDatanode(nodeInfo);
     clusterMap.remove(nodeInfo);
+
+    if (safeMode != null) {
+      safeMode.checkMode();
+    }
   }
 
   void unprotectedRemoveDatanode(DatanodeDescriptor nodeDescr) {
@@ -4281,6 +4289,10 @@ public class FSNamesystem implements FSC
     }
   }
 
+  int getNumLiveDataNodes() {
+    return getNumberOfDatanodes(DatanodeReportType.LIVE);
+  }
+
   int getNumberOfDatanodes(DatanodeReportType type) {
     return getDatanodeListForReport(type).size(); 
   }
@@ -4815,6 +4827,8 @@ public class FSNamesystem implements FSC
     // configuration fields
     /** Safe mode threshold condition %.*/
     private double threshold;
+    /** Minimum number of live datanodes required before leaving safe mode. */
+    private int datanodeThreshold;
     /** Safe mode extension after the threshold. */
     private int extension;
     /** Min replication required by safe mode. */
@@ -4842,6 +4856,9 @@ public class FSNamesystem implements FSC
      */
     SafeModeInfo(Configuration conf) {
       this.threshold = conf.getFloat("dfs.safemode.threshold.pct", 0.95f);
+      this.datanodeThreshold = conf.getInt(
+          DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY,
+          DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT);
       this.extension = conf.getInt("dfs.safemode.extension", 0);
       this.safeReplication = conf.getInt("dfs.replication.min", 1);
       this.blockTotal = 0; 
@@ -4858,6 +4875,7 @@ public class FSNamesystem implements FSC
      */
     private SafeModeInfo() {
       this.threshold = 1.5f;  // this threshold can never be reached
+      this.datanodeThreshold = Integer.MAX_VALUE;
       this.extension = Integer.MAX_VALUE;
       this.safeReplication = Short.MAX_VALUE + 1; // more than maxReplication
       this.blockTotal = -1;
@@ -4956,7 +4974,8 @@ public class FSNamesystem implements FSC
      * if DFS is empty or {@link #threshold} == 0
      */
     boolean needEnter() {
-      return getSafeBlockRatio() < threshold;
+      return getSafeBlockRatio() < threshold ||
+          getNumLiveDataNodes() < datanodeThreshold;
     }
       
     /**
@@ -5053,15 +5072,44 @@ public class FSNamesystem implements FSC
       }
       if(blockTotal < 0)
         return leaveMsg + ".";
-      String safeBlockRatioMsg = 
-        String.format("The ratio of reported blocks %.4f has " +
-          (reached == 0 ? "not " : "") + "reached the threshold %.4f. ",
-          getSafeBlockRatio(), threshold) + leaveMsg;
-      if(reached == 0 || isManual())  // threshold is not reached or manual
-        return safeBlockRatioMsg + ".";
+
+      int numLive = getNumLiveDataNodes();
+      String msg = "";
+      if (reached == 0) {
+        if (getSafeBlockRatio() < threshold) {
+          msg += String.format(
+            "The reported blocks is only %d"
+            + " but the threshold is %.4f and the total blocks %d.",
+            blockSafe, threshold, blockTotal);
+        }
+        if (numLive < datanodeThreshold) {
+          if (!"".equals(msg)) {
+            msg += "\n";
+          }
+          msg += String.format(
+            "The number of live datanodes %d needs an additional %d live "
+            + "datanodes to reach the minimum number %d.",
+            numLive, (datanodeThreshold - numLive), datanodeThreshold);
+        }
+        msg += " " + leaveMsg;
+      } else {
+        msg = String.format("The reported blocks %d has reached the threshold"
+            + " %.4f of total blocks %d.", blockSafe, threshold, 
+            blockTotal);
+
+        if (datanodeThreshold > 0) {
+          msg += String.format(" The number of live datanodes %d has reached "
+                               + "the minimum number %d.",
+                               numLive, datanodeThreshold);
+        }
+        msg += " " + leaveMsg;
+      }
+      if(reached == 0 || isManual()) {  // threshold is not reached or manual       
+        return msg + ".";
+      }
       // extension period is in progress
-      return safeBlockRatioMsg + " in " 
-            + Math.abs(reached + extension - now())/1000 + " seconds.";
+      return msg + " in " + Math.abs(reached + extension - now()) / 1000
+          + " seconds.";
     }
 
     /**
@@ -5239,7 +5287,7 @@ public class FSNamesystem implements FSC
     safeMode.leave(checkForUpgrades);
   }
     
-  String getSafeModeTip() {
+  public String getSafeModeTip() {
     if (!isInSafeMode())
       return "";
     return safeMode.getTurnOffTip();

Modified: hadoop/common/branches/branch-1.1/src/test/org/apache/hadoop/hdfs/TestSafeMode.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1.1/src/test/org/apache/hadoop/hdfs/TestSafeMode.java?rev=1411081&r1=1411080&r2=1411081&view=diff
==============================================================================
--- hadoop/common/branches/branch-1.1/src/test/org/apache/hadoop/hdfs/TestSafeMode.java (original)
+++ hadoop/common/branches/branch-1.1/src/test/org/apache/hadoop/hdfs/TestSafeMode.java Mon Nov 19 06:52:24 2012
@@ -99,6 +99,50 @@ public class TestSafeMode {
     }
   }
 
+  /**
+   * Verify that the NameNode stays in safemode when dfs.namenode.safemode.min.datanodes
+   * is set to a number greater than the number of live datanodes.
+   */
+  @Test
+  public void testDatanodeThreshold() throws IOException {
+    MiniDFSCluster cluster = null;
+    DistributedFileSystem fs = null;
+    try {
+      Configuration conf = new Configuration();
+      conf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0);
+      conf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY, 1);
+
+      // bring up a cluster with no datanodes
+      cluster = new MiniDFSCluster(conf, 0, true, null);
+      cluster.waitActive();
+      fs = (DistributedFileSystem)cluster.getFileSystem();
+
+      assertTrue("No datanode started, but we require one - safemode expected",
+                 fs.setSafeMode(SafeModeAction.SAFEMODE_GET));
+
+      String tipMsg = cluster.getNameNode().getNamesystem().getSafeModeTip();
+      assertTrue("Safemode tip message looks right",
+                 tipMsg.contains("The number of live datanodes 0 needs an " +
+                                 "additional 1 live"));
+
+      // Start a datanode
+      cluster.startDataNodes(conf, 1, true, null, null);
+
+      // Wait long enough for safemode check to refire
+      try {
+        Thread.sleep(1000);
+      } catch (InterruptedException ignored) {}
+
+      // We now should be out of safe mode.
+      assertFalse(
+        "Out of safe mode after starting datanode.",
+        fs.setSafeMode(SafeModeAction.SAFEMODE_GET));
+    } finally {
+      if (fs != null) fs.close();
+      if (cluster != null) cluster.shutdown();
+    }
+  }
+
   @Test
   public void testSafeModeWhenZeroBlockLocations() throws IOException {
     MiniDFSCluster cluster = null;