You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by li...@apache.org on 2013/03/29 19:18:51 UTC

svn commit: r1462571 - /hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java

Author: liyin
Date: Fri Mar 29 18:18:51 2013
New Revision: 1462571

URL: http://svn.apache.org/r1462571
Log:
[0.89-fb] [HBASE-8216] Enable the serverManager to respect the config change for Power failures

Author: aaiyer

Summary:
ServerManager has a bug in which we only re-evaluate information about
the rack failure after all the servers are back. This needs to be fixed.

Test Plan:
Run MR tests.
Do rack-failure testing on DL and ensure that the region servers
time out correctly when part of the rack recovers.

Reviewers: liyintang, kannan, kranganathan

Reviewed By: liyintang

Differential Revision: https://phabricator.fb.com/D750252

Modified:
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1462571&r1=1462570&r2=1462571&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Fri Mar 29 18:18:51 2013
@@ -1164,7 +1164,7 @@ public class ServerManager {
   boolean expireTimedOutServers(long timeout, int maxServersToExpire) {
     long curTime = EnvironmentEdgeManager.currentTimeMillis();
     boolean waitingForMoreServersInRackToTimeOut = false;
-    boolean reportDetails = false;
+    boolean reportDetails = LOG.isTraceEnabled();
     int serverCount = serversToLoad.size();
     if ((curTime > lastDetailedLogAt + (3600 * 1000)) ||
         lastLoggedServerCount != serverCount) {
@@ -1215,7 +1215,10 @@ public class ServerManager {
       if (reportDetails) {
         LOG.debug("server=" + si.getServerName() + " rack=" + rack +
             " timed-out=" + timedOut + " expired=" + expired +
-            " timeOfLastPingFromServer=" + timeOfLastPingFromThisServer);
+            " timeOfLastPingFromServer=" + timeOfLastPingFromThisServer +
+            " timeOfLastPingFromThisRack=" + timeOfLastPingFromThisRack +
+            " load.expireAfter =" + load.expireAfter
+            );
       }
       if (!timedOut) {
         continue;
@@ -1242,7 +1245,7 @@ public class ServerManager {
     // this rack was received
 
     for (String rack : rackLastReportAtMap.keySet()) {
-      if (! rackTimedOutServersMap.keySet().contains(rack)) {
+      if (!rackTimedOutServersMap.keySet().contains(rack)) {
         if (inaccessibleRacks.remove(rack)) {
           LOG.info("rack " + rack + " has become accessible");
         }
@@ -1251,7 +1254,6 @@ public class ServerManager {
 
     Set<HServerAddress> specialServers = this.getRootAndMetaServers();
 
-    next_rack:
     for (Map.Entry<String, List<HServerInfo>> e:
       rackTimedOutServersMap.entrySet()) {
       String rack = e.getKey();
@@ -1273,6 +1275,9 @@ public class ServerManager {
         if (load.expireAfter == Long.MAX_VALUE) {
           load.expireAfter = lastHeardFromRackAt + timeout;
           long timeToExpiry = load.expireAfter - curTime;
+          LOG.debug("Setting load.expireAfter to " + load.expireAfter +
+              " for " + si.getServerName() +
+              " timeToExpiry  is " + timeToExpiry);
           if (timeToExpiry > 0) { // is first time
             LOG.info("No report from server " + si.getServerName() +
                 " for last " + (curTime - load.lastLoadRefreshTime) +
@@ -1281,10 +1286,18 @@ public class ServerManager {
         }
         if (curTime > load.expireAfter) {
           numExpired++;
+          LOG.debug("server=" + si.getServerName() + " rack=" + rack +
+              " curTime=" + curTime +
+              " load.expireAfter =" + load.expireAfter
+              + "numExpired++"
+              );
         } else {
           // wait for all the timed-out servers to become ready to expire
           waitingForMoreServersInRackToTimeOut = true;
-          continue next_rack;
+          LOG.debug("server=" + si.getServerName() + " rack=" + rack +
+            " curTime=" + curTime +
+            " load.expireAfter =" + load.expireAfter
+            + " waitingForMoreServersInRackToTimeOut set to true");
         }
       }
       int cappedMaxServersToExpire = Math.min(maxServersToExpire,
@@ -1311,6 +1324,8 @@ public class ServerManager {
             specialServersInRack = true;
           } else  {
             load.expireAfter = Long.MAX_VALUE;
+            LOG.debug("Resetting load.expireAfter : to Long.MAX_VALUE for " +
+                  si.getServerName());
           }
         }
         if (!inaccessibleRacks.contains(rack)) {
@@ -1323,8 +1338,6 @@ public class ServerManager {
               + (specialServersInRack? " the rest": " any")
               + ", hoping for rack" + " to become accessible");
         }
-        if (specialServersInRack)
-          continue next_rack;
       }
       for (HServerInfo si : timedOutServers) {
         HServerLoad load = serversToLoad.get(si.getServerName());
@@ -1339,6 +1352,9 @@ public class ServerManager {
               " no report for last " + (curTime - load.lastLoadRefreshTime)
               + " (no znode expired yet)");
           this.expireServer(si);
+        } else {
+          LOG.debug("Checking again server=" + si.getServerName() +
+              " curTime=" + curTime + " load.expireAfter =" + load.expireAfter);
         }
       }
     }