You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ap...@apache.org on 2019/01/12 02:01:21 UTC
[hbase] branch branch-1 updated: HBASE-21325 Force to terminate
regionserver when abort hang in somewhere
This is an automated email from the ASF dual-hosted git repository.
apurtell pushed a commit to branch branch-1
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-1 by this push:
new e7ff91f HBASE-21325 Force to terminate regionserver when abort hang in somewhere
e7ff91f is described below
commit e7ff91f35ee9fbf9f66ef6e999a8155368d91753
Author: Guanghao Zhang <zg...@apache.org>
AuthorDate: Fri Oct 19 19:34:04 2018 +0800
HBASE-21325 Force to terminate regionserver when abort hang in somewhere
Conflicts:
hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbortTimeout.java
Amending-Author: Andrew Purtell <ap...@apache.org>
---
.../hadoop/hbase/regionserver/HRegionServer.java | 39 +++++-
.../regionserver/TestRegionServerAbortTimeout.java | 137 +++++++++++++++++++++
2 files changed, 175 insertions(+), 1 deletion(-)
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index 6e5ce80..5f250c0 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -51,6 +51,8 @@ import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
+import java.util.Timer;
+import java.util.TimerTask;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
@@ -312,6 +314,11 @@ public class HRegionServer extends HasThread implements
// Go down hard. Used if file system becomes unavailable and also in
// debugging and unit tests.
private volatile boolean abortRequested;
+ public static final String ABORT_TIMEOUT = "hbase.regionserver.abort.timeout";
+ // Default abort timeout is 1200 seconds (expressed in milliseconds), a deliberately conservative value
+ private static final long DEFAULT_ABORT_TIMEOUT = 1200000;
+ // Config key naming the TimerTask class that is run when the abort timeout expires
+ public static final String ABORT_TIMEOUT_TASK = "hbase.regionserver.abort.timeout.task";
ConcurrentMap<String, Integer> rowlocks = new ConcurrentHashMap<String, Integer>();
@@ -1041,12 +1048,31 @@ public class HRegionServer extends HasThread implements
abort(prefix + t.getMessage(), t);
}
}
+
// Run shutdown.
if (mxBean != null) {
MBeanUtil.unregisterMBean(mxBean);
mxBean = null;
}
- if (this.leases != null) this.leases.closeAfterLeasesExpire();
+
+ if (abortRequested) {
+ Timer abortMonitor = new Timer("Abort regionserver monitor", true);
+ TimerTask abortTimeoutTask = null;
+ try {
+ abortTimeoutTask =
+ Class.forName(conf.get(ABORT_TIMEOUT_TASK, SystemExitWhenAbortTimeout.class.getName()))
+ .asSubclass(TimerTask.class).getDeclaredConstructor().newInstance();
+ } catch (Exception e) {
+ LOG.warn("Initialize abort timeout task failed", e);
+ }
+ if (abortTimeoutTask != null) {
+ abortMonitor.schedule(abortTimeoutTask, conf.getLong(ABORT_TIMEOUT, DEFAULT_ABORT_TIMEOUT));
+ }
+ }
+
+ if (this.leases != null) {
+ this.leases.closeAfterLeasesExpire();
+ }
if (this.splitLogWorker != null) {
splitLogWorker.stop();
}
@@ -3552,4 +3578,15 @@ public class HRegionServer extends HasThread implements
public void unassign(byte[] regionName) throws IOException {
clusterConnection.getAdmin().unassign(regionName, false);
}
+
+  /**
+   * Timer task that forcibly terminates the region server process when an abort has not
+   * completed within the abort timeout. It is the default task used when
+   * {@code hbase.regionserver.abort.timeout.task} is not configured.
+   */
+  private static class SystemExitWhenAbortTimeout extends TimerTask {
+
+    /**
+     * Explicit public no-arg constructor. This task is created reflectively via
+     * {@code getDeclaredConstructor().newInstance()}. Without this declaration the implicit
+     * default constructor of a private nested class is itself private, and reflective access
+     * to it from the enclosing class fails with IllegalAccessException on JVMs that lack
+     * nest-based access control (before Java 11), so the task would never be scheduled.
+     */
+    public SystemExitWhenAbortTimeout() {
+    }
+
+    /**
+     * Logs a warning and exits the JVM with status 1.
+     */
+    @Override
+    public void run() {
+      LOG.warn("Aborting region server timed out, terminate forcibly...");
+      System.exit(1);
+    }
+  }
}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbortTimeout.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbortTimeout.java
new file mode 100644
index 0000000..ed129c5
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerAbortTimeout.java
@@ -0,0 +1,137 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.regionserver;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+import java.util.TimerTask;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HColumnDescriptor;
+import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.coprocessor.BaseRegionObserver;
+import org.apache.hadoop.hbase.coprocessor.ObserverContext;
+import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.testclassification.RegionServerTests;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.Threads;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Checks that the configured abort timeout task runs when a region server is aborted while
+ * its regions are slow to close (each region close is delayed by a coprocessor).
+ */
+@Category({ RegionServerTests.class, MediumTests.class })
+public class TestRegionServerAbortTimeout {
+  private static final Logger LOG = LoggerFactory.getLogger(TestRegionServerAbortTimeout.class);
+
+  private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
+
+  private static final TableName TABLE_NAME = TableName.valueOf("RSAbort");
+
+  private static final byte[] CF = Bytes.toBytes("cf");
+
+  private static final byte[] CQ = Bytes.toBytes("cq");
+
+  private static final int REGIONS_NUM = 5;
+
+  // Delay, in milliseconds, injected before every region close; also used as the abort timeout.
+  private static final int SLEEP_TIME_WHEN_CLOSE_REGION = 1000;
+
+  // Set by TestAbortTimeoutTask on the timer thread and read by the test thread.
+  private static volatile boolean abortTimeoutTaskScheduled = false;
+
+  /**
+   * Starts a two-server mini cluster with a short abort timeout and a test timeout task, then
+   * creates a pre-split table whose regions sleep before closing.
+   */
+  @BeforeClass
+  public static void setUp() throws Exception {
+    Configuration conf = UTIL.getConfiguration();
+    // Will schedule an abort timeout task after SLEEP_TIME_WHEN_CLOSE_REGION ms
+    conf.setLong(HRegionServer.ABORT_TIMEOUT, SLEEP_TIME_WHEN_CLOSE_REGION);
+    // Replace the default task so the timeout only flips a flag.
+    conf.set(HRegionServer.ABORT_TIMEOUT_TASK, TestAbortTimeoutTask.class.getName());
+    UTIL.startMiniCluster(2);
+    HTableDescriptor td = new HTableDescriptor(TABLE_NAME);
+    td.addCoprocessor(SleepWhenCloseCoprocessor.class.getName());
+    td.addFamily(new HColumnDescriptor(CF));
+    UTIL.getHBaseAdmin().createTable(td, Bytes.toBytes("0"), Bytes.toBytes("9"), REGIONS_NUM);
+  }
+
+  /**
+   * Drops the test table and shuts the mini cluster down.
+   */
+  @AfterClass
+  public static void tearDown() throws Exception {
+    UTIL.getHBaseAdmin().disableTable(TABLE_NAME);
+    UTIL.getHBaseAdmin().deleteTable(TABLE_NAME);
+    UTIL.shutdownMiniCluster();
+  }
+
+  /**
+   * Aborts one region server while a background writer is loading data, then waits until only
+   * one live region server thread remains and asserts that the timeout task has run by then.
+   */
+  @Test
+  public void testAbortTimeout() throws Exception {
+    Thread writer = new Thread() {
+      @Override
+      public void run() {
+        try {
+          try (Table table = UTIL.getConnection().getTable(TABLE_NAME)) {
+            for (int i = 0; i < 10000; i++) {
+              table.put(new Put(Bytes.toBytes(i)).addColumn(CF, CQ, Bytes.toBytes(i)));
+            }
+          }
+        } catch (IOException e) {
+          // Write failures are tolerated here; keep the cause in the log for diagnosis.
+          LOG.warn("Failed to load data", e);
+        }
+      }
+    };
+    // Daemon thread so an unfinished writer cannot keep the JVM alive.
+    writer.setDaemon(true);
+    writer.start();
+
+    // Abort one region server
+    UTIL.getMiniHBaseCluster().getRegionServer(0).abort("Abort RS for test");
+
+    long startTime = System.currentTimeMillis();
+    long timeout = REGIONS_NUM * SLEEP_TIME_WHEN_CLOSE_REGION * 10;
+    while (System.currentTimeMillis() - startTime < timeout) {
+      if (UTIL.getMiniHBaseCluster().getLiveRegionServerThreads().size() == 1) {
+        assertTrue("Abort timer task should be scheduled", abortTimeoutTaskScheduled);
+        return;
+      }
+      Threads.sleep(SLEEP_TIME_WHEN_CLOSE_REGION);
+    }
+    fail("Failed to abort a region server in " + timeout + " ms");
+  }
+
+  /**
+   * Timeout task configured in place of the default one; it only records that it ran.
+   */
+  static class TestAbortTimeoutTask extends TimerTask {
+
+    public TestAbortTimeoutTask() {
+    }
+
+    @Override
+    public void run() {
+      LOG.info("TestAbortTimeoutTask was scheduled");
+      abortTimeoutTaskScheduled = true;
+    }
+  }
+
+  /**
+   * Region observer that sleeps for SLEEP_TIME_WHEN_CLOSE_REGION ms before each region close.
+   */
+  public static class SleepWhenCloseCoprocessor extends BaseRegionObserver {
+    @Override
+    public void preClose(ObserverContext<RegionCoprocessorEnvironment> c, boolean abortRequested)
+        throws IOException {
+      Threads.sleep(SLEEP_TIME_WHEN_CLOSE_REGION);
+    }
+  }
+}