You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by zh...@apache.org on 2018/08/24 02:08:34 UTC
hbase git commit: HBASE-21095 The timeout retry logic for several
procedures are broken after master restarts
Repository: hbase
Updated Branches:
refs/heads/master 780670ede -> aac1a7014
HBASE-21095 The timeout retry logic for several procedures are broken after master restarts
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/aac1a701
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/aac1a701
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/aac1a701
Branch: refs/heads/master
Commit: aac1a70147ec950c98db5f31d8906ce54547523a
Parents: 780670e
Author: zhangduo <zh...@apache.org>
Authored: Thu Aug 23 22:16:13 2018 +0800
Committer: Duo Zhang <zh...@apache.org>
Committed: Fri Aug 24 10:07:31 2018 +0800
----------------------------------------------------------------------
.../master/assignment/AssignmentManager.java | 67 +++++++++++++++-----
.../assignment/TransitRegionStateProcedure.java | 12 ++--
.../master/procedure/ServerCrashProcedure.java | 15 +++--
.../assignment/TestCloseRegionWhileRSCrash.java | 11 +++-
4 files changed, 75 insertions(+), 30 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hbase/blob/aac1a701/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index 9b020c8..a91f8e4 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -1414,12 +1414,25 @@ public class AssignmentManager implements ServerListener {
// Region Status update
// Should only be called in TransitRegionStateProcedure
// ============================================================================================
+ private void transitStateAndUpdate(RegionStateNode regionNode, RegionState.State newState,
+ RegionState.State... expectedStates) throws IOException {
+ RegionState.State state = regionNode.getState();
+ regionNode.transitionState(newState, expectedStates);
+ boolean succ = false;
+ try {
+ regionStateStore.updateRegionLocation(regionNode);
+ succ = true;
+ } finally {
+ if (!succ) {
+ // revert
+ regionNode.setState(state);
+ }
+ }
+ }
// should be called within the synchronized block of RegionStateNode
void regionOpening(RegionStateNode regionNode) throws IOException {
- regionNode.transitionState(State.OPENING, RegionStates.STATES_EXPECTED_ON_OPEN);
- regionStateStore.updateRegionLocation(regionNode);
-
+ transitStateAndUpdate(regionNode, State.OPENING, RegionStates.STATES_EXPECTED_ON_OPEN);
regionStates.addRegionToServer(regionNode);
// update the operation count metrics
metrics.incrementOperationCounter();
@@ -1429,23 +1442,33 @@ public class AssignmentManager implements ServerListener {
// The parameter 'giveUp' means whether we will try to open the region again, if it is true, then
// we will persist the FAILED_OPEN state into hbase:meta.
void regionFailedOpen(RegionStateNode regionNode, boolean giveUp) throws IOException {
- if (regionNode.getRegionLocation() != null) {
- regionStates.removeRegionFromServer(regionNode.getRegionLocation(), regionNode);
- }
+ RegionState.State state = regionNode.getState();
+ ServerName regionLocation = regionNode.getRegionLocation();
if (giveUp) {
regionNode.setState(State.FAILED_OPEN);
regionNode.setRegionLocation(null);
- regionStateStore.updateRegionLocation(regionNode);
+ boolean succ = false;
+ try {
+ regionStateStore.updateRegionLocation(regionNode);
+ succ = true;
+ } finally {
+ if (!succ) {
+ // revert
+ regionNode.setState(state);
+ regionNode.setRegionLocation(regionLocation);
+ }
+ }
+ }
+ if (regionLocation != null) {
+ regionStates.removeRegionFromServer(regionLocation, regionNode);
}
}
// should be called within the synchronized block of RegionStateNode
void regionOpened(RegionStateNode regionNode) throws IOException {
- regionNode.transitionState(State.OPEN, RegionStates.STATES_EXPECTED_ON_OPEN);
// TODO: OPENING Updates hbase:meta too... we need to do both here and there?
// That is a lot of hbase:meta writing.
- regionStateStore.updateRegionLocation(regionNode);
-
+ transitStateAndUpdate(regionNode, State.OPEN, RegionStates.STATES_EXPECTED_ON_OPEN);
RegionInfo hri = regionNode.getRegionInfo();
if (isMetaRegion(hri)) {
// Usually we'd set a table ENABLED at this stage but hbase:meta is ALWAYs enabled, it
@@ -1460,7 +1483,7 @@ public class AssignmentManager implements ServerListener {
// should be called within the synchronized block of RegionStateNode
void regionClosing(RegionStateNode regionNode) throws IOException {
- regionNode.transitionState(State.CLOSING, RegionStates.STATES_EXPECTED_ON_CLOSE);
+ transitStateAndUpdate(regionNode, State.CLOSING, RegionStates.STATES_EXPECTED_ON_CLOSE);
regionStateStore.updateRegionLocation(regionNode);
RegionInfo hri = regionNode.getRegionInfo();
@@ -1477,16 +1500,26 @@ public class AssignmentManager implements ServerListener {
// The parameter 'normally' means whether we are closed cleanly, if it is true, then it means that
// we are closed due to a RS crash.
void regionClosed(RegionStateNode regionNode, boolean normally) throws IOException {
+ RegionState.State state = regionNode.getState();
+ ServerName regionLocation = regionNode.getRegionLocation();
regionNode.transitionState(normally ? State.CLOSED : State.ABNORMALLY_CLOSED,
RegionStates.STATES_EXPECTED_ON_CLOSE);
- ServerName loc = regionNode.getRegionLocation();
- if (loc != null) {
- // could be a retry so add a check here to avoid set the lastHost to null.
- regionNode.setLastHost(loc);
+ boolean succ = false;
+ try {
+ regionStateStore.updateRegionLocation(regionNode);
+ succ = true;
+ } finally {
+ if (!succ) {
+ // revert
+ regionNode.setState(state);
+ regionNode.setRegionLocation(regionLocation);
+ }
+ }
+ if (regionLocation != null) {
+ regionNode.setLastHost(regionLocation);
regionNode.setRegionLocation(null);
- regionStates.removeRegionFromServer(loc, regionNode);
+ regionStates.removeRegionFromServer(regionLocation, regionNode);
}
- regionStateStore.updateRegionLocation(regionNode);
}
public void markRegionAsSplit(final RegionInfo parent, final ServerName serverName,
http://git-wip-us.apache.org/repos/asf/hbase/blob/aac1a701/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
index e853b9b..0d2c4b2 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
@@ -326,13 +326,9 @@ public class TransitRegionStateProcedure
"Failed transition, suspend {}secs {}; {}; waiting on rectified condition fixed " +
"by other Procedure or operator intervention",
backoff / 1000, this, regionNode.toShortString(), e);
- regionNode.getProcedureEvent().suspend();
- if (regionNode.getProcedureEvent().suspendIfNotReady(this)) {
- setTimeout(Math.toIntExact(backoff));
- setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
- throw new ProcedureSuspendedException();
- }
- return Flow.HAS_MORE_STATE;
+ setTimeout(Math.toIntExact(backoff));
+ setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
+ throw new ProcedureSuspendedException();
}
}
@@ -342,7 +338,7 @@ public class TransitRegionStateProcedure
@Override
protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
setState(ProcedureProtos.ProcedureState.RUNNABLE);
- getRegionStateNode(env).getProcedureEvent().wake(env.getProcedureScheduler());
+ env.getProcedureScheduler().addFront(this);
return false; // 'false' means that this procedure handled the timeout
}
http://git-wip-us.apache.org/repos/asf/hbase/blob/aac1a701/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
index db7a872..1fcc6eb 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
@@ -114,6 +114,17 @@ public class ServerCrashProcedure
notifiedDeadServer = true;
}
+ switch (state) {
+ case SERVER_CRASH_START:
+ case SERVER_CRASH_SPLIT_META_LOGS:
+ case SERVER_CRASH_ASSIGN_META:
+ break;
+ default:
+ // If hbase:meta is not assigned, yield.
+ if (env.getAssignmentManager().waitMetaLoaded(this)) {
+ throw new ProcedureSuspendedException();
+ }
+ }
try {
switch (state) {
case SERVER_CRASH_START:
@@ -134,10 +145,6 @@ public class ServerCrashProcedure
setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS);
break;
case SERVER_CRASH_GET_REGIONS:
- // If hbase:meta is not assigned, yield.
- if (env.getAssignmentManager().waitMetaLoaded(this)) {
- throw new ProcedureSuspendedException();
- }
this.regionsOnCrashedServer =
services.getAssignmentManager().getRegionStates().getServerRegionInfoSet(serverName);
// Where to go next? Depends on whether we should split logs at all or
http://git-wip-us.apache.org/repos/asf/hbase/blob/aac1a701/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java
index 9b1f4ca..3573bd6 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java
@@ -27,6 +27,7 @@ import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface;
@@ -149,6 +150,7 @@ public class TestCloseRegionWhileRSCrash {
@BeforeClass
public static void setUp() throws Exception {
+ UTIL.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
UTIL.startMiniCluster(3);
UTIL.createTable(TABLE_NAME, CF);
UTIL.getAdmin().balancerSwitch(false, true);
@@ -174,7 +176,7 @@ public class TestCloseRegionWhileRSCrash {
HRegionServer dstRs = UTIL.getOtherRegionServer(srcRs);
ProcedureExecutor<MasterProcedureEnv> procExec =
UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
- procExec.submitProcedure(new DummyServerProcedure(srcRs.getServerName()));
+ long dummyProcId = procExec.submitProcedure(new DummyServerProcedure(srcRs.getServerName()));
ARRIVE.await();
UTIL.getMiniHBaseCluster().killRegionServer(srcRs.getServerName());
UTIL.waitFor(30000,
@@ -206,7 +208,14 @@ public class TestCloseRegionWhileRSCrash {
}
Thread.sleep(1000);
}
+ // let's close the connection to make sure that the SCP can not update meta successfully
+ UTIL.getMiniHBaseCluster().getMaster().getConnection().close();
RESUME.countDown();
+ UTIL.waitFor(30000, () -> procExec.isFinished(dummyProcId));
+ Thread.sleep(2000);
+ // here we restart
+ UTIL.getMiniHBaseCluster().stopMaster(0).join();
+ UTIL.getMiniHBaseCluster().startMaster();
t.join();
// Make sure that the region is online, it may not on the original target server, as we will set
// forceNewPlan to true if there is a server crash