You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by zh...@apache.org on 2018/01/23 03:07:48 UTC

hbase git commit: HBASE-19838 Can not shutdown backup master cleanly when it has already tried to become the active master

Repository: hbase
Updated Branches:
  refs/heads/master 541f8ad8a -> 970636c5a


HBASE-19838 Can not shutdown backup master cleanly when it has already tried to become the active master

On Master@shutdown, close the shared Master connection to kill any
ongoing RPCs by hosted clients.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
Call close ont the Master shared clusterconnection to kill any ongoing
rpcs.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
Remove guts of close; we were closing the Masters connection....not
our responsibility.

Added unit test written by Duo Zhang which demonstrates the case where
Master will not go down.

Signed-off-by: zhangduo <zh...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/970636c5
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/970636c5
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/970636c5

Branch: refs/heads/master
Commit: 970636c5afbd1a12a998af3e8b0825f806bedeca
Parents: 541f8ad
Author: Michael Stack <st...@apache.org>
Authored: Mon Jan 22 14:44:16 2018 -0800
Committer: zhangduo <zh...@apache.org>
Committed: Tue Jan 23 11:07:00 2018 +0800

----------------------------------------------------------------------
 .../org/apache/hadoop/hbase/master/HMaster.java |  21 +++-
 .../hadoop/hbase/master/ServerManager.java      |  16 +--
 .../hbase/regionserver/HRegionServer.java       |   3 +
 .../hbase/master/TestShutdownBackupMaster.java  | 108 +++++++++++++++++++
 4 files changed, 132 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/970636c5/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index 09b18bc..88d8596 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -2605,7 +2605,7 @@ public class HMaster extends HRegionServer implements MasterServices {
   }
 
   @Override
-  public void abort(final String msg, final Throwable t) {
+  public void abort(String reason, Throwable cause) {
     if (isAborted() || isStopped()) {
       return;
     }
@@ -2614,8 +2614,9 @@ public class HMaster extends HRegionServer implements MasterServices {
       LOG.error(HBaseMarkers.FATAL, "Master server abort: loaded coprocessors are: " +
           getLoadedCoprocessors());
     }
-    if (t != null) {
-      LOG.error(HBaseMarkers.FATAL, msg, t);
+    String msg = "***** ABORTING master " + this + ": " + reason + " *****";
+    if (cause != null) {
+      LOG.error(HBaseMarkers.FATAL, msg, cause);
     } else {
       LOG.error(HBaseMarkers.FATAL, msg);
     }
@@ -2666,14 +2667,19 @@ public class HMaster extends HRegionServer implements MasterServices {
     return rsFatals;
   }
 
+  /**
+   * Shutdown the cluster.
+   * Master runs a coordinated stop of all RegionServers and then itself.
+   */
   public void shutdown() throws IOException {
     if (cpHost != null) {
       cpHost.preShutdown();
     }
-
+    // Tell the servermanager cluster is down.
     if (this.serverManager != null) {
       this.serverManager.shutdownCluster();
     }
+    // Set the cluster down flag; broadcast across the cluster.
     if (this.clusterStatusTracker != null){
       try {
         this.clusterStatusTracker.setClusterDown();
@@ -2681,6 +2687,13 @@ public class HMaster extends HRegionServer implements MasterServices {
         LOG.error("ZooKeeper exception trying to set cluster as down in ZK", e);
       }
     }
+    // Shutdown our cluster connection. This will kill any hosted RPCs that might be going on;
+    // this is what we want especially if the Master is in startup phase doing call outs to
+    // hbase:meta, etc. when cluster is down. Without ths connection close, we'd have to wait on
+    // the rpc to timeout.
+    if (this.clusterConnection != null) {
+      this.clusterConnection.close();
+    }
   }
 
   public void stopMaster() throws IOException {

http://git-wip-us.apache.org/repos/asf/hbase/blob/970636c5/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index 3db6033..3c84bfb 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -200,10 +200,8 @@ public class ServerManager {
     Configuration c = master.getConfiguration();
     maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
     warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
-    this.connection = connect ? master.getClusterConnection() : null;
-    this.rpcControllerFactory = this.connection == null
-        ? null
-        : connection.getRpcControllerFactory();
+    this.connection = connect? master.getClusterConnection(): null;
+    this.rpcControllerFactory = this.connection == null? null: connection.getRpcControllerFactory();
   }
 
   /**
@@ -968,16 +966,10 @@ public class ServerManager {
   }
 
   /**
-   * Stop the ServerManager.  Currently closes the connection to the master.
+   * Stop the ServerManager.
    */
   public void stop() {
-    if (connection != null) {
-      try {
-        connection.close();
-      } catch (IOException e) {
-        LOG.error("Attempt to close connection to master failed", e);
-      }
-    }
+    // Nothing to do.
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/hbase/blob/970636c5/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index 036a5c2..3844415 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -251,6 +251,9 @@ public class HRegionServer extends HasThread implements
    * Cluster connection to be shared by services.
    * Initialized at server startup and closed when server shuts down.
    * Clients must never close it explicitly.
+   * Clients hosted by this Server should make use of this clusterConnection rather than create
+   * their own; if they create their own, there is no way for the hosting server to shutdown
+   * ongoing client RPCs.
    */
   protected ClusterConnection clusterConnection;
 

http://git-wip-us.apache.org/repos/asf/hbase/blob/970636c5/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestShutdownBackupMaster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestShutdownBackupMaster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestShutdownBackupMaster.java
new file mode 100644
index 0000000..02d7f2f
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestShutdownBackupMaster.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master;
+
+import static org.junit.Assert.assertNotNull;
+
+import java.io.IOException;
+import java.util.concurrent.CountDownLatch;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.CategoryBasedTimeout;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.MiniHBaseCluster;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
+import org.apache.zookeeper.KeeperException;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.rules.TestRule;
+
+/**
+ * Test to confirm that we will not hang when stop a backup master which is trying to become the
+ * active master. See HBASE-19838
+ */
+@Category({ MasterTests.class, MediumTests.class })
+public class TestShutdownBackupMaster {
+  @Rule public final TestRule timeout = CategoryBasedTimeout.builder().withTimeout(this.getClass()).
+          withLookingForStuckThread(true).build();
+
+  private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
+
+  private static volatile CountDownLatch ARRIVE;
+
+  private static volatile CountDownLatch CONTINUE;
+
+  public static final class MockHMaster extends HMaster {
+
+    public MockHMaster(Configuration conf) throws IOException, KeeperException {
+      super(conf);
+    }
+
+    @Override
+    void initClusterSchemaService() throws IOException, InterruptedException {
+      if (ARRIVE != null) {
+        ARRIVE.countDown();
+        CONTINUE.await();
+      }
+      super.initClusterSchemaService();
+    }
+  }
+
+  @BeforeClass
+  public static void setUpBeforeClass() throws Exception {
+    UTIL.getConfiguration().setClass(HConstants.MASTER_IMPL, MockHMaster.class, HMaster.class);
+    UTIL.startMiniCluster(2, 2);
+    UTIL.waitUntilAllSystemRegionsAssigned();
+  }
+
+  @AfterClass
+  public static void tearDownAfterClass() throws Exception {
+    // make sure that we can stop the cluster cleanly
+    UTIL.shutdownMiniCluster();
+  }
+
+  @Test
+  public void testShutdownWhileBecomingActive() throws InterruptedException {
+    MiniHBaseCluster cluster = UTIL.getHBaseCluster();
+    HMaster activeMaster = null;
+    HMaster backupMaster = null;
+    for (MasterThread t : cluster.getMasterThreads()) {
+      if (t.getMaster().isActiveMaster()) {
+        activeMaster = t.getMaster();
+      } else {
+        backupMaster = t.getMaster();
+      }
+    }
+    assertNotNull(activeMaster);
+    assertNotNull(backupMaster);
+    ARRIVE = new CountDownLatch(1);
+    CONTINUE = new CountDownLatch(1);
+    activeMaster.abort("Aborting active master for test");
+    // wait until we arrive the initClusterSchemaService
+    ARRIVE.await();
+    // killall RSes
+    cluster.getRegionServerThreads().stream().map(t -> t.getRegionServer())
+        .forEachOrdered(rs -> rs.abort("Aborting RS for test"));
+    CONTINUE.countDown();
+  }
+}