You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2017/03/08 16:14:51 UTC

hbase git commit: HBASE-17718 Difference between RS's servername and its ephemeral node cause SSH stop working; AMENDMENT.

Repository: hbase
Updated Branches:
  refs/heads/master e239e8d2d -> 6a57050c2


HBASE-17718 Difference between RS's servername and its ephemeral node cause SSH stop working; AMENDMENT.

Make test tighter by extending ServerListener so can find when Master
is in the waiting-on-regionservers state and making more assertions
about state.

Fix error where I would move on from waiting-on-regionservers if
we had waited max time.


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/6a57050c
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/6a57050c
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/6a57050c

Branch: refs/heads/master
Commit: 6a57050c24100438508199c9856b95be7024803a
Parents: e239e8d
Author: Michael Stack <st...@apache.org>
Authored: Wed Mar 8 04:58:53 2017 -0800
Committer: Michael Stack <st...@apache.org>
Committed: Wed Mar 8 08:14:24 2017 -0800

----------------------------------------------------------------------
 .../org/apache/hadoop/hbase/master/HMaster.java |  8 +-
 .../hadoop/hbase/master/ServerListener.java     | 15 ++--
 .../hadoop/hbase/master/ServerManager.java      | 49 ++++++-----
 .../hbase/zookeeper/DrainingServerTracker.java  | 22 ++---
 .../TestRSKilledWhenInitializing.java           | 88 ++++++++++++--------
 5 files changed, 103 insertions(+), 79 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/6a57050c/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index 501d3bd..a1cbe53 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -305,7 +305,7 @@ public class HMaster extends HRegionServer implements MasterServices {
   MemoryBoundedLogMessageBuffer rsFatals;
 
   // flag set after we become the active master (used for testing)
-  private volatile boolean isActiveMaster = false;
+  private volatile boolean activeMaster = false;
 
   // flag set after we complete initialization once active,
   // it is not private since it's used in unit tests
@@ -597,7 +597,7 @@ public class HMaster extends HRegionServer implements MasterServices {
   @Override
   protected void waitForMasterActive(){
     boolean tablesOnMaster = BaseLoadBalancer.tablesOnMaster(conf);
-    while (!(tablesOnMaster && isActiveMaster)
+    while (!(tablesOnMaster && activeMaster)
         && !isStopped() && !isAborted()) {
       sleeper.sleep();
     }
@@ -733,7 +733,7 @@ public class HMaster extends HRegionServer implements MasterServices {
   private void finishActiveMasterInitialization(MonitoredTask status)
       throws IOException, InterruptedException, KeeperException, CoordinatedStateException {
 
-    isActiveMaster = true;
+    activeMaster = true;
     Thread zombieDetector = new Thread(new InitializationMonitor(this),
         "ActiveMasterInitializationMonitor-" + System.currentTimeMillis());
     zombieDetector.start();
@@ -2555,7 +2555,7 @@ public class HMaster extends HRegionServer implements MasterServices {
    */
   @Override
   public boolean isActiveMaster() {
-    return isActiveMaster;
+    return activeMaster;
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/hbase/blob/6a57050c/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerListener.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerListener.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerListener.java
index f168686..7946735 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerListener.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerListener.java
@@ -22,20 +22,25 @@ import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.hbase.ServerName;
 
 /**
- * Get notification of server events. The invocations are inline
- * so make sure your implementation is fast else you'll slow hbase.
+ * Get notification of server registration events. The invocations are inline
+ * so make sure your implementation is fast or else you'll slow hbase.
  */
 @InterfaceAudience.Private
 public interface ServerListener {
   /**
+   * Started waiting on RegionServers to check-in.
+   */
+  default void waiting() {};
+
+  /**
    * The server has joined the cluster.
    * @param serverName The remote servers name.
    */
-  void serverAdded(final ServerName serverName);
+  default void serverAdded(final ServerName serverName) {};
 
   /**
    * The server was removed from the cluster.
    * @param serverName The remote servers name.
    */
-  void serverRemoved(final ServerName serverName);
-}
+  default void serverRemoved(final ServerName serverName) {};
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/hbase/blob/6a57050c/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index e6b60d8..db0a0e5 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -53,6 +53,7 @@ import org.apache.hadoop.hbase.ZooKeeperConnectionException;
 import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.hbase.client.ClusterConnection;
 import org.apache.hadoop.hbase.client.RetriesExhaustedException;
+import org.apache.hadoop.hbase.ipc.FailedServerException;
 import org.apache.hadoop.hbase.ipc.HBaseRpcController;
 import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
 import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer;
@@ -772,7 +773,16 @@ public class ServerManager {
    */
   private void checkForRSznode(final ServerName serverName, final ServiceException se) {
     if (se.getCause() == null) return;
-    if (!(se.getCause() instanceof ConnectException)) return;
+    Throwable t = se.getCause();
+    if (t instanceof ConnectException) {
+      // If this, proceed to do cleanup.
+    } else {
+      // Look for FailedServerException
+      if (!(t instanceof IOException)) return;
+      if (t.getCause() == null) return;
+      if (!(t.getCause() instanceof FailedServerException)) return;
+      // Ok, found FailedServerException -- continue.
+    }
     if (!isServerOnline(serverName)) return;
     // We think this server is online. Check it has a znode up. Currently, a RS
     // registers an ephereral znode in zk. If not present, something is up. Maybe
@@ -1030,20 +1040,19 @@ public class ServerManager {
    *
    * @throws InterruptedException
    */
-  public void waitForRegionServers(MonitoredTask status)
-  throws InterruptedException {
+  public void waitForRegionServers(MonitoredTask status) throws InterruptedException {
     final long interval = this.master.getConfiguration().
-      getLong(WAIT_ON_REGIONSERVERS_INTERVAL, 1500);
+        getLong(WAIT_ON_REGIONSERVERS_INTERVAL, 1500);
     final long timeout = this.master.getConfiguration().
-      getLong(WAIT_ON_REGIONSERVERS_TIMEOUT, 4500);
+        getLong(WAIT_ON_REGIONSERVERS_TIMEOUT, 4500);
     // Min is not an absolute; just a friction making us wait longer on server checkin.
     int minToStart = getMinToStart();
     int maxToStart = this.master.getConfiguration().
-      getInt(WAIT_ON_REGIONSERVERS_MAXTOSTART, Integer.MAX_VALUE);
+        getInt(WAIT_ON_REGIONSERVERS_MAXTOSTART, Integer.MAX_VALUE);
     if (maxToStart < minToStart) {
       LOG.warn(String.format("The value of '%s' (%d) is set less than '%s' (%d), ignoring.",
-        WAIT_ON_REGIONSERVERS_MAXTOSTART, maxToStart,
-        WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
+          WAIT_ON_REGIONSERVERS_MAXTOSTART, maxToStart,
+          WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
       maxToStart = Integer.MAX_VALUE;
     }
 
@@ -1060,19 +1069,19 @@ public class ServerManager {
     // Next, we will keep cycling if ANY of the following three conditions are true:
     // 1. The time since a regionserver registered is < interval (means servers are actively checking in).
     // 2. We are under the total timeout.
-    // 3. The count of servers is < minimum expected AND we are within timeout (this just puts up
-    // a little friction making us wait a bit longer if < minimum servers).
+    // 3. The count of servers is < minimum.
+    for (ServerListener listener: this.listeners) {
+      listener.waiting();
+    }
     while (!this.master.isStopped() && count < maxToStart &&
-        (((lastCountChange + interval) > now) ||
-            (timeout > slept) ||
-            ((count < minToStart) && (timeout > slept)))) {
+        ((lastCountChange + interval) > now || timeout > slept || count < minToStart)) {
       // Log some info at every interval time or if there is a change
       if (oldCount != count || lastLogTime + interval < now) {
         lastLogTime = now;
         String msg =
-          "Waiting for RegionServer count=" + count + " to settle; waited "+
-            slept + "ms, expecting minimum=" + minToStart + "server(s) (max="+ getStrForMax(maxToStart) + "server(s)), " +
-            "timeout=" + timeout + "ms, lastChange=" + (lastCountChange - now) + "ms";
+            "Waiting on RegionServer count=" + count + " to settle; waited="+
+                slept + "ms, expecting min=" + minToStart + " server(s), max="+ getStrForMax(maxToStart) +
+                " server(s), " + "timeout=" + timeout + "ms, lastChange=" + (lastCountChange - now) + "ms";
         LOG.info(msg);
         status.setStatus(msg);
       }
@@ -1089,11 +1098,9 @@ public class ServerManager {
         lastCountChange = now;
       }
     }
-
-    LOG.info("Finished waiting for RegionServer count=" + count + " to settle, slept for " + slept + "ms," +
-      " expecting minimum=" + minToStart + " server(s) (max=" +  getStrForMax(maxToStart) + " server(s),"+
-      " Master is "+ (this.master.isStopped() ? "stopped.": "running")
-    );
+    LOG.info("Finished wait on RegionServer count=" + count + "; waited=" + slept + "ms," +
+        " expected min=" + minToStart + " server(s), max=" +  getStrForMax(maxToStart) + " server(s),"+
+        " master is "+ (this.master.isStopped() ? "stopped.": "running"));
   }
 
   private String getStrForMax(final int max) {

http://git-wip-us.apache.org/repos/asf/hbase/blob/6a57050c/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/DrainingServerTracker.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/DrainingServerTracker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/DrainingServerTracker.java
index 32e0862..a4880df 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/DrainingServerTracker.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/DrainingServerTracker.java
@@ -70,22 +70,14 @@ public class DrainingServerTracker extends ZooKeeperListener {
   public void start() throws KeeperException, IOException {
     watcher.registerListener(this);
     // Add a ServerListener to check if a server is draining when it's added.
-    serverManager.registerListener(
-        new ServerListener() {
-
-          @Override
-          public void serverAdded(ServerName sn) {
-            if (drainingServers.contains(sn)){
-              serverManager.addServerToDrainList(sn);
-            }
-          }
-
-          @Override
-          public void serverRemoved(ServerName serverName) {
-            // no-op
-          }
+    serverManager.registerListener(new ServerListener() {
+      @Override
+      public void serverAdded(ServerName sn) {
+        if (drainingServers.contains(sn)){
+          serverManager.addServerToDrainList(sn);
         }
-    );
+      }
+    });
     List<String> servers =
       ZKUtil.listChildrenAndWatchThem(watcher, watcher.znodePaths.drainingZNode);
     add(servers);

http://git-wip-us.apache.org/repos/asf/hbase/blob/6a57050c/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java
index c01db2a..304bfe7 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenInitializing.java
@@ -19,6 +19,7 @@
 package org.apache.hadoop.hbase.regionserver;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
 import java.util.List;
@@ -38,6 +39,7 @@ import org.apache.hadoop.hbase.LocalHBaseCluster;
 import org.apache.hadoop.hbase.MiniHBaseCluster;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.master.HMaster;
+import org.apache.hadoop.hbase.master.ServerListener;
 import org.apache.hadoop.hbase.master.ServerManager;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionServerStartupResponse;
 import org.apache.hadoop.hbase.testclassification.MediumTests;
@@ -92,52 +94,57 @@ public class TestRSKilledWhenInitializing {
             RegisterAndDieRegionServer.class);
     final MasterThread master = startMaster(cluster.getMasters().get(0));
     try {
-      masterActive.set(true);
-      // Now start regionservers.
-      // First RS to report for duty will kill itself when it gets a response.
-      // See below in the RegisterAndDieRegionServer handleReportForDutyResponse.
+      // Master is up waiting on RegionServers to check in. Now start RegionServers.
       for (int i = 0; i < NUM_RS; i++) {
         cluster.getRegionServers().get(i).start();
       }
-      // Now wait on master to see NUM_RS + 1 servers as being online, NUM_RS and itself.
-      // Then wait until the killed RS gets removed from zk and triggers Master to remove
-      // it from list of online RS.
-      List<ServerName> onlineServersList =
-          master.getMaster().getServerManager().getOnlineServersList();
-      while (onlineServersList.size() < NUM_RS + 1) {
-        // Spin till we see NUM_RS + Master in online servers list.
+      // Now wait on master to see NUM_RS + 1 servers as being online, thats NUM_RS plus
+      // the Master itself (because Master hosts hbase:meta and checks in as though it a RS).
+      List<ServerName> onlineServersList = null;
+      do {
         onlineServersList = master.getMaster().getServerManager().getOnlineServersList();
-      }
-      LOG.info(onlineServersList);
-      assertEquals(NUM_RS + 1, onlineServersList.size());
-      // Steady state. How many regions open?
-      // Wait until killedRS is set
+      } while (onlineServersList.size() < (NUM_RS + 1));
+      // Wait until killedRS is set. Means RegionServer is starting to go down.
       while (killedRS.get() == null) {
-        Threads.sleep(10);
+        Threads.sleep(1);
+      }
+      // Wait on the RegionServer to fully die.
+      while (cluster.getLiveRegionServers().size() > NUM_RS) {
+        Threads.sleep(1);
+      }
+      // Make sure Master is fully up before progressing. Could take a while if regions
+      // being reassigned.
+      while (!master.getMaster().isInitialized()) {
+        Threads.sleep(1);
       }
-      final int regionsOpenCount = master.getMaster().getAssignmentManager().getNumRegionsOpened();
+
+      // Now in steady state. How many regions open? Master should have too many regionservers
+      // showing still. The downed RegionServer should still be showing as registered.
+      assertTrue(master.getMaster().getServerManager().isServerOnline(killedRS.get()));
       // Find non-meta region (namespace?) and assign to the killed server. That'll trigger cleanup.
-      Map<HRegionInfo, ServerName> assigments =
-          master.getMaster().getAssignmentManager().getRegionStates().getRegionAssignments();
+      Map<HRegionInfo, ServerName> assignments = null;
+      do {
+        assignments = master.getMaster().getAssignmentManager().getRegionStates().getRegionAssignments();
+      } while (assignments == null || assignments.size() < 2);
       HRegionInfo hri = null;
-      for (Map.Entry<HRegionInfo, ServerName> e: assigments.entrySet()) {
+      for (Map.Entry<HRegionInfo, ServerName> e: assignments.entrySet()) {
         if (e.getKey().isMetaRegion()) continue;
         hri = e.getKey();
         break;
       }
       // Try moving region to the killed server. It will fail. As by-product, we will
       // remove the RS from Master online list because no corresponding znode.
+      assertEquals(NUM_RS + 1, master.getMaster().getServerManager().getOnlineServersList().size());
       LOG.info("Move " + hri.getEncodedName() + " to " + killedRS.get());
       master.getMaster().move(hri.getEncodedNameAsBytes(),
           Bytes.toBytes(killedRS.get().toString()));
-      while (onlineServersList.size() > NUM_RS) {
+      // Wait until the RS no longer shows as registered in Master.
+      while (onlineServersList.size() > (NUM_RS + 1)) {
         Thread.sleep(100);
         onlineServersList = master.getMaster().getServerManager().getOnlineServersList();
       }
-      // Just for kicks, ensure namespace was put back on the old server after above failed move.
-      assertEquals(regionsOpenCount,
-          master.getMaster().getAssignmentManager().getNumRegionsOpened());
     } finally {
+      // Shutdown is messy with complaints about fs being closed. Why? TODO.
       cluster.shutdown();
       cluster.join();
       TEST_UTIL.shutdownMiniDFSCluster();
@@ -146,19 +153,32 @@ public class TestRSKilledWhenInitializing {
     }
   }
 
+  /**
+   * Start Master. Get as far as the state where Master is waiting on
+   * RegionServers to check in, then return.
+   */
   private MasterThread startMaster(MasterThread master) {
     master.start();
-    long startTime = System.currentTimeMillis();
-    while (!master.getMaster().isInitialized()) {
-      try {
-        Thread.sleep(100);
-      } catch (InterruptedException ignored) {
-        LOG.info("Interrupted: ignoring");
-      }
-      if (System.currentTimeMillis() > startTime + 30000) {
-        throw new RuntimeException("Master not active after 30 seconds");
+    // It takes a while until ServerManager creation to happen inside Master startup.
+    while (master.getMaster().getServerManager() == null) {
+      continue;
+    }
+    // Set a listener for the waiting-on-RegionServers state. We want to wait
+    // until this condition before we leave this method and start regionservers.
+    final AtomicBoolean waiting = new AtomicBoolean(false);
+    if (master.getMaster().getServerManager() == null) throw new NullPointerException("SM");
+    master.getMaster().getServerManager().registerListener(new ServerListener() {
+      @Override
+      public void waiting() {
+        waiting.set(true);
       }
+    });
+    // Wait until the Master gets to place where it is waiting on RegionServers to check in.
+    while (!waiting.get()) {
+      continue;
     }
+    // Set the global master-is-active; gets picked up by regionservers later.
+    masterActive.set(true);
     return master;
   }