You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by zg...@apache.org on 2018/10/29 02:11:59 UTC

hbase git commit: HBASE-21325 Force to terminate regionserver when abort hang in somewhere

Repository: hbase
Updated Branches:
  refs/heads/master 7cdb52519 -> c0b994b0c


HBASE-21325 Force to terminate regionserver when abort hang in somewhere


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/c0b994b0
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/c0b994b0
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/c0b994b0

Branch: refs/heads/master
Commit: c0b994b0c6dcf7b8f6bf9b44491c0a312953c014
Parents: 7cdb525
Author: Guanghao Zhang <zg...@apache.org>
Authored: Fri Oct 19 19:34:04 2018 +0800
Committer: Guanghao Zhang <zg...@apache.org>
Committed: Mon Oct 29 10:08:51 2018 +0800

----------------------------------------------------------------------
 .../hbase/regionserver/HRegionServer.java       |  38 +++++
 .../TestRegionServerAbortTimeout.java           | 156 +++++++++++++++++++
 2 files changed, 194 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/c0b994b0/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index 7adf58e..68eb006 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -38,6 +38,8 @@ import java.util.Map.Entry;
 import java.util.Objects;
 import java.util.Set;
 import java.util.SortedMap;
+import java.util.Timer;
+import java.util.TimerTask;
 import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.concurrent.ConcurrentHashMap;
@@ -316,6 +318,11 @@ public class HRegionServer extends HasThread implements
   // Go down hard. Used if file system becomes unavailable and also in
   // debugging and unit tests.
   private volatile boolean abortRequested;
+  public static final String ABORT_TIMEOUT = "hbase.regionserver.abort.timeout";
+  // Default abort timeout is 1200 seconds for safe
+  private static final long DEFAULT_ABORT_TIMEOUT = 1200000;
+  // Will run this task when abort timeout
+  public static final String ABORT_TIMEOUT_TASK = "hbase.regionserver.abort.timeout.task";
 
   ConcurrentMap<String, Integer> rowlocks = new ConcurrentHashMap<>();
 
@@ -1026,6 +1033,22 @@ public class HRegionServer extends HasThread implements
         abort(prefix + t.getMessage(), t);
       }
     }
+
+    if (abortRequested) {
+      Timer abortMonitor = new Timer("Abort regionserver monitor", true);
+      TimerTask abortTimeoutTask = null;
+      try {
+        abortTimeoutTask =
+            Class.forName(conf.get(ABORT_TIMEOUT_TASK, SystemExitWhenAbortTimeout.class.getName()))
+                .asSubclass(TimerTask.class).getDeclaredConstructor().newInstance();
+      } catch (Exception e) {
+        LOG.warn("Initialize abort timeout task failed", e);
+      }
+      if (abortTimeoutTask != null) {
+        abortMonitor.schedule(abortTimeoutTask, conf.getLong(ABORT_TIMEOUT, DEFAULT_ABORT_TIMEOUT));
+      }
+    }
+
     if (this.leases != null) {
       this.leases.closeAfterLeasesExpire();
     }
@@ -3799,4 +3822,19 @@ public class HRegionServer extends HasThread implements
   public boolean isShutDown() {
     return shutDown;
   }
+
+  /**
+   * Force to terminate region server when abort timeout.
+   */
+  private static class SystemExitWhenAbortTimeout extends TimerTask {
+
+    public SystemExitWhenAbortTimeout() {
+    }
+
+    @Override
+    public void run() {
+      LOG.warn("Aborting region server timed out, terminate forcibly...");
+      System.exit(1);
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/c0b994b0/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbortTimeout.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbortTimeout.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbortTimeout.java
new file mode 100644
index 0000000..f134519
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbortTimeout.java
@@ -0,0 +1,156 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.regionserver;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+import java.util.Optional;
+import java.util.TimerTask;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.StartMiniClusterOption;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.client.TableDescriptor;
+import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
+import org.apache.hadoop.hbase.coprocessor.ObserverContext;
+import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor;
+import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
+import org.apache.hadoop.hbase.coprocessor.RegionObserver;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.testclassification.RegionServerTests;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.Threads;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Category({ RegionServerTests.class, MediumTests.class })
+public class TestRegionServerAbortTimeout {
+
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+      HBaseClassTestRule.forClass(TestRegionServerAbortTimeout.class);
+
+  private static final Logger LOG = LoggerFactory.getLogger(TestRegionServerAbortTimeout.class);
+
+  private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
+
+  private static TableName TABLE_NAME = TableName.valueOf("RSAbort");
+
+  private static byte[] CF = Bytes.toBytes("cf");
+
+  private static byte[] CQ = Bytes.toBytes("cq");
+
+  private static final int REGIONS_NUM = 5;
+
+  private static final int SLEEP_TIME_WHEN_CLOSE_REGION = 1000;
+
+  private static volatile boolean abortTimeoutTaskScheduled = false;
+
+  @BeforeClass
+  public static void setUp() throws Exception {
+    Configuration conf = UTIL.getConfiguration();
+    // Will schedule a abort timeout task after SLEEP_TIME_WHEN_CLOSE_REGION ms
+    conf.setLong(HRegionServer.ABORT_TIMEOUT, SLEEP_TIME_WHEN_CLOSE_REGION);
+    conf.set(HRegionServer.ABORT_TIMEOUT_TASK, TestAbortTimeoutTask.class.getName());
+    StartMiniClusterOption option = StartMiniClusterOption.builder().numRegionServers(2).build();
+    UTIL.startMiniCluster(option);
+    TableDescriptor td = TableDescriptorBuilder.newBuilder(TABLE_NAME)
+        .setCoprocessor(SleepWhenCloseCoprocessor.class.getName())
+        .setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(CF).build()).build();
+    UTIL.getAdmin().createTable(td, Bytes.toBytes("0"), Bytes.toBytes("9"), REGIONS_NUM);
+  }
+
+  @AfterClass
+  public static void tearDown() throws Exception {
+    UTIL.getAdmin().disableTable(TABLE_NAME);
+    UTIL.getAdmin().deleteTable(TABLE_NAME);
+    UTIL.shutdownMiniCluster();
+  }
+
+  @Test
+  public void testAbortTimeout() throws Exception {
+    Thread writer = new Thread(() -> {
+      try {
+        try (Table table = UTIL.getConnection().getTable(TABLE_NAME)) {
+          for (int i = 0; i < 10000; i++) {
+            table.put(new Put(Bytes.toBytes(i)).addColumn(CF, CQ, Bytes.toBytes(i)));
+          }
+        }
+      } catch (IOException e) {
+        LOG.warn("Failed to load data");
+      }
+    });
+    writer.setDaemon(true);
+    writer.start();
+
+    // Abort one region server
+    UTIL.getMiniHBaseCluster().getRegionServer(0).abort("Abort RS for test");
+
+    long startTime = System.currentTimeMillis();
+    long timeout = REGIONS_NUM * SLEEP_TIME_WHEN_CLOSE_REGION * 10;
+    while (System.currentTimeMillis() - startTime < timeout) {
+      if (UTIL.getMiniHBaseCluster().getLiveRegionServerThreads().size() == 1) {
+        assertTrue("Abort timer task should be scheduled", abortTimeoutTaskScheduled);
+        return;
+      }
+      Threads.sleep(SLEEP_TIME_WHEN_CLOSE_REGION);
+    }
+    fail("Failed to abort a region server in " + timeout + " ms");
+  }
+
+  static class TestAbortTimeoutTask extends TimerTask {
+
+    public TestAbortTimeoutTask() {
+    }
+
+    @Override
+    public void run() {
+      LOG.info("TestAbortTimeoutTask was scheduled");
+      abortTimeoutTaskScheduled = true;
+    }
+  }
+
+  public static class SleepWhenCloseCoprocessor implements RegionCoprocessor, RegionObserver {
+
+    public SleepWhenCloseCoprocessor() {
+    }
+
+    @Override
+    public Optional<RegionObserver> getRegionObserver() {
+      return Optional.of(this);
+    }
+
+    @Override
+    public void preClose(ObserverContext<RegionCoprocessorEnvironment> c, boolean abortRequested)
+        throws IOException {
+      Threads.sleep(SLEEP_TIME_WHEN_CLOSE_REGION);
+    }
+  }
+}