You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by zh...@apache.org on 2019/05/13 08:17:42 UTC
[hbase] branch branch-2 updated: HBASE-22365 Region may be opened
on two RegionServers
This is an automated email from the ASF dual-hosted git repository.
zhangduo pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2 by this push:
new 4b4c3ba HBASE-22365 Region may be opened on two RegionServers
4b4c3ba is described below
commit 4b4c3ba6e7f221702c2ed11a902819158319b3b1
Author: zhangduo <zh...@apache.org>
AuthorDate: Fri May 10 20:50:59 2019 +0800
HBASE-22365 Region may be opened on two RegionServers
---
.../hbase/master/assignment/AssignmentManager.java | 101 ++++++----
.../master/assignment/CloseRegionProcedure.java | 22 ++-
.../master/assignment/OpenRegionProcedure.java | 64 +++---
.../assignment/RegionRemoteProcedureBase.java | 49 +++--
.../assignment/TransitRegionStateProcedure.java | 11 +-
.../master/assignment/TestSCPGetRegionsRace.java | 218 +++++++++++++++++++++
6 files changed, 381 insertions(+), 84 deletions(-)
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index c2af566..1dc6c18 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -213,6 +213,9 @@ public class AssignmentManager {
try {
regionNode.setRegionLocation(regionState.getServerName());
regionNode.setState(regionState.getState());
+ if (regionNode.getProcedure() != null) {
+ regionNode.getProcedure().stateLoaded(this, regionNode);
+ }
setMetaAssigned(regionState.getRegion(), regionState.getState() == State.OPEN);
} finally {
regionNode.unlock();
@@ -235,14 +238,12 @@ public class AssignmentManager {
TransitRegionStateProcedure existingProc = regionNode.getProcedure();
if (existingProc != null) {
// This is possible, as we will detach the procedure from the RSN before we
- // actually finish the procedure. This is because that, we will update the region state
- // directly in the reportTransition method for TRSP, and theoretically the region transition
- // has been done, so we need to detach the procedure from the RSN. But actually the
- // procedure has not been marked as done in the pv2 framework yet, so it is possible that we
- // schedule a new TRSP immediately and when arriving here, we will find out that there are
- // multiple TRSPs for the region. But we can make sure that, only the last one can take the
- // charge, the previous ones should have all been finished already.
- // So here we will compare the proc id, the greater one will win.
+ // actually finish the procedure. This is because that, we will detach the TRSP from the RSN
+ // during execution, at that time, the procedure has not been marked as done in the pv2
+ // framework yet, so it is possible that we schedule a new TRSP immediately and when
+ // arriving here, we will find out that there are multiple TRSPs for the region. But we can
+ // make sure that, only the last one can take the charge, the previous ones should have all
+ // been finished already. So here we will compare the proc id, the greater one will win.
if (existingProc.getProcId() < proc.getProcId()) {
// the new one wins, unset and set it to the new one below
regionNode.unsetProcedure(existingProc);
@@ -1301,7 +1302,6 @@ public class AssignmentManager {
// In any of these cases, state is empty. For now, presume OFFLINE but there are probably
// cases where we need to probe more to be sure this correct; TODO informed by experience.
LOG.info(regionInfo.getEncodedName() + " regionState=null; presuming " + State.OFFLINE);
-
localState = State.OFFLINE;
}
RegionStateNode regionNode = regionStates.getOrCreateRegionStateNode(regionInfo);
@@ -1322,6 +1322,9 @@ public class AssignmentManager {
} else if (localState == State.OFFLINE || regionInfo.isOffline()) {
regionStates.addToOfflineRegions(regionNode);
}
+ if (regionNode.getProcedure() != null) {
+ regionNode.getProcedure().stateLoaded(AssignmentManager.this, regionNode);
+ }
}
});
@@ -1482,8 +1485,9 @@ public class AssignmentManager {
private static final State[] STATES_EXPECTED_ON_UNASSIGN_OR_MOVE = { State.OPEN };
// ============================================================================================
- // Region Status update
- // Should only be called in TransitRegionStateProcedure
+ // Region Status update
+ // Should only be called in TransitRegionStateProcedure(and related procedures), as the locking
+ // and pre-assumptions are very tricky.
// ============================================================================================
private void transitStateAndUpdate(RegionStateNode regionNode, RegionState.State newState,
RegionState.State... expectedStates) throws IOException {
@@ -1512,7 +1516,7 @@ public class AssignmentManager {
metrics.incrementOperationCounter();
}
- // should be called within the synchronized block of RegionStateNode.
+ // should be called under the RegionStateNode lock
// The parameter 'giveUp' means whether we will try to open the region again, if it is true, then
// we will persist the FAILED_OPEN state into hbase:meta.
void regionFailedOpen(RegionStateNode regionNode, boolean giveUp) throws IOException {
@@ -1538,24 +1542,7 @@ public class AssignmentManager {
}
}
- // should be called within the synchronized block of RegionStateNode
- void regionOpened(RegionStateNode regionNode) throws IOException {
- // TODO: OPENING Updates hbase:meta too... we need to do both here and there?
- // That is a lot of hbase:meta writing.
- transitStateAndUpdate(regionNode, State.OPEN, STATES_EXPECTED_ON_OPEN);
- RegionInfo hri = regionNode.getRegionInfo();
- if (isMetaRegion(hri)) {
- // Usually we'd set a table ENABLED at this stage but hbase:meta is ALWAYs enabled, it
- // can't be disabled -- so skip the RPC (besides... enabled is managed by TableStateManager
- // which is backed by hbase:meta... Avoid setting ENABLED to avoid having to update state
- // on table that contains state.
- setMetaAssigned(hri, true);
- }
- regionStates.addRegionToServer(regionNode);
- regionStates.removeFromFailedOpen(hri);
- }
-
- // should be called within the synchronized block of RegionStateNode
+ // should be called under the RegionStateNode lock
void regionClosing(RegionStateNode regionNode) throws IOException {
transitStateAndUpdate(regionNode, State.CLOSING, STATES_EXPECTED_ON_CLOSING);
@@ -1569,18 +1556,36 @@ public class AssignmentManager {
metrics.incrementOperationCounter();
}
- // should be called within the synchronized block of RegionStateNode
- // The parameter 'normally' means whether we are closed cleanly, if it is true, then it means that
- // we are closed due to a RS crash.
- void regionClosed(RegionStateNode regionNode, boolean normally) throws IOException {
- RegionState.State state = regionNode.getState();
+ // for open and close, they will first be persist to the procedure store in
+ // RegionRemoteProcedureBase. So here we will first change the in memory state as it is considered
+ // as succeeded if the persistence to procedure store is succeeded, and then when the
+ // RegionRemoteProcedureBase is woken up, we will persist the RegionStateNode to hbase:meta.
+
+ // should be called under the RegionStateNode lock
+ void regionOpenedWithoutPersistingToMeta(RegionStateNode regionNode) throws IOException {
+ regionNode.transitionState(State.OPEN, STATES_EXPECTED_ON_OPEN);
+ RegionInfo regionInfo = regionNode.getRegionInfo();
+ regionStates.addRegionToServer(regionNode);
+ regionStates.removeFromFailedOpen(regionInfo);
+ }
+
+ // should be called under the RegionStateNode lock
+ void regionClosedWithoutPersistingToMeta(RegionStateNode regionNode) throws IOException {
ServerName regionLocation = regionNode.getRegionLocation();
- if (normally) {
- regionNode.transitionState(State.CLOSED, STATES_EXPECTED_ON_CLOSED);
- } else {
- // For SCP
- regionNode.transitionState(State.ABNORMALLY_CLOSED);
+ regionNode.transitionState(State.CLOSED, STATES_EXPECTED_ON_CLOSED);
+ regionNode.setRegionLocation(null);
+ if (regionLocation != null) {
+ regionNode.setLastHost(regionLocation);
+ regionStates.removeRegionFromServer(regionLocation, regionNode);
}
+ }
+
+ // should be called under the RegionStateNode lock
+ // for SCP
+ void regionClosedAbnormally(RegionStateNode regionNode) throws IOException {
+ RegionState.State state = regionNode.getState();
+ ServerName regionLocation = regionNode.getRegionLocation();
+ regionNode.transitionState(State.ABNORMALLY_CLOSED);
regionNode.setRegionLocation(null);
boolean succ = false;
try {
@@ -1599,6 +1604,22 @@ public class AssignmentManager {
}
}
+ void persistToMeta(RegionStateNode regionNode) throws IOException {
+ regionStateStore.updateRegionLocation(regionNode);
+ RegionInfo regionInfo = regionNode.getRegionInfo();
+ if (isMetaRegion(regionInfo) && regionNode.getState() == State.OPEN) {
+ // Usually we'd set a table ENABLED at this stage but hbase:meta is ALWAYs enabled, it
+ // can't be disabled -- so skip the RPC (besides... enabled is managed by TableStateManager
+ // which is backed by hbase:meta... Avoid setting ENABLED to avoid having to update state
+ // on table that contains state.
+ setMetaAssigned(regionInfo, true);
+ }
+ }
+
+ // ============================================================================================
+ // The above methods can only be called in TransitRegionStateProcedure(and related procedures)
+ // ============================================================================================
+
public void markRegionAsSplit(final RegionInfo parent, final ServerName serverName,
final RegionInfo daughterA, final RegionInfo daughterB) throws IOException {
// Update hbase:meta. Parent will be marked offline and split up in hbase:meta.
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java
index 4fd60f0..af31966 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java
@@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
+import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionCloseOperation;
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
@@ -90,8 +91,8 @@ public class CloseRegionProcedure extends RegionRemoteProcedureBase {
}
@Override
- protected void reportTransition(RegionStateNode regionNode, TransitionCode transitionCode,
- long seqId) throws IOException {
+ protected void checkTransition(RegionStateNode regionNode, TransitionCode transitionCode,
+ long seqId) throws UnexpectedStateException {
if (transitionCode != TransitionCode.CLOSED) {
throw new UnexpectedStateException("Received report unexpected " + transitionCode +
" transition, " + regionNode.toShortString() + ", " + this + ", expected CLOSED.");
@@ -99,8 +100,19 @@ public class CloseRegionProcedure extends RegionRemoteProcedureBase {
}
@Override
- protected void updateTransition(MasterProcedureEnv env, RegionStateNode regionNode,
- TransitionCode transitionCode, long seqId) throws IOException {
- env.getAssignmentManager().regionClosed(regionNode, true);
+ protected void updateTransitionWithoutPersistingToMeta(MasterProcedureEnv env,
+ RegionStateNode regionNode, TransitionCode transitionCode, long seqId) throws IOException {
+ assert transitionCode == TransitionCode.CLOSED;
+ env.getAssignmentManager().regionClosedWithoutPersistingToMeta(regionNode);
+ }
+
+ @Override
+ protected void restoreSucceedState(AssignmentManager am, RegionStateNode regionNode, long seqId)
+ throws IOException {
+ if (regionNode.getState() == State.CLOSED) {
+ // should have already been persisted, ignore
+ return;
+ }
+ am.regionClosedWithoutPersistingToMeta(regionNode);
}
}
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java
index 579b757..4f7fcef 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java
@@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
+import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionOpenOperation;
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
@@ -77,19 +78,31 @@ public class OpenRegionProcedure extends RegionRemoteProcedureBase {
return env.getAssignmentManager().getAssignmentManagerMetrics().getOpenProcMetrics();
}
+ private void regionOpenedWithoutPersistingToMeta(AssignmentManager am, RegionStateNode regionNode,
+ TransitionCode transitionCode, long openSeqNum) throws IOException {
+ if (openSeqNum < regionNode.getOpenSeqNum()) {
+ LOG.warn(
+ "Received report {} transition from {} for {}, pid={} but the new openSeqNum {}" +
+ " is less than the current one {}, ignoring...",
+ transitionCode, targetServer, regionNode, getProcId(), openSeqNum,
+ regionNode.getOpenSeqNum());
+ } else {
+ regionNode.setOpenSeqNum(openSeqNum);
+ }
+ am.regionOpenedWithoutPersistingToMeta(regionNode);
+ }
+
@Override
- protected void reportTransition(RegionStateNode regionNode, TransitionCode transitionCode,
- long seqId) throws IOException {
+ protected void checkTransition(RegionStateNode regionNode, TransitionCode transitionCode,
+ long openSeqNum) throws UnexpectedStateException {
switch (transitionCode) {
case OPENED:
- // this is the openSeqNum
- if (seqId < 0) {
+ if (openSeqNum < 0) {
throw new UnexpectedStateException("Received report unexpected " + TransitionCode.OPENED +
- " transition openSeqNum=" + seqId + ", " + regionNode + ", proc=" + this);
+ " transition openSeqNum=" + openSeqNum + ", " + regionNode + ", proc=" + this);
}
break;
case FAILED_OPEN:
- // nothing to check
break;
default:
throw new UnexpectedStateException(
@@ -99,27 +112,26 @@ public class OpenRegionProcedure extends RegionRemoteProcedureBase {
}
@Override
- protected void updateTransition(MasterProcedureEnv env, RegionStateNode regionNode,
- TransitionCode transitionCode, long openSeqNum) throws IOException {
- switch (transitionCode) {
- case OPENED:
- if (openSeqNum < regionNode.getOpenSeqNum()) {
- LOG.warn(
- "Received report {} transition from {} for {}, pid={} but the new openSeqNum {}" +
- " is less than the current one {}, ignoring...",
- transitionCode, targetServer, regionNode, getProcId(), openSeqNum,
- regionNode.getOpenSeqNum());
- } else {
- regionNode.setOpenSeqNum(openSeqNum);
- }
- env.getAssignmentManager().regionOpened(regionNode);
- break;
- case FAILED_OPEN:
- env.getAssignmentManager().regionFailedOpen(regionNode, false);
- break;
- default:
- throw new UnexpectedStateException("Unexpected transition code: " + transitionCode);
+ protected void updateTransitionWithoutPersistingToMeta(MasterProcedureEnv env,
+ RegionStateNode regionNode, TransitionCode transitionCode, long openSeqNum)
+ throws IOException {
+ if (transitionCode == TransitionCode.OPENED) {
+ regionOpenedWithoutPersistingToMeta(env.getAssignmentManager(), regionNode, transitionCode,
+ openSeqNum);
+ } else {
+ assert transitionCode == TransitionCode.FAILED_OPEN;
+ // will not persist to meta if giveUp is false
+ env.getAssignmentManager().regionFailedOpen(regionNode, false);
}
+ }
+ @Override
+ protected void restoreSucceedState(AssignmentManager am, RegionStateNode regionNode,
+ long openSeqNum) throws IOException {
+ if (regionNode.getState() == State.OPEN) {
+ // should have already been persisted, ignore
+ return;
+ }
+ regionOpenedWithoutPersistingToMeta(am, regionNode, TransitionCode.OPENED, openSeqNum);
}
}
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java
index 9377d89..fa00958 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java
@@ -153,9 +153,13 @@ public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedur
return false;
}
- // do some checks to see if the report is valid, without actually updating meta.
- protected abstract void reportTransition(RegionStateNode regionNode,
- TransitionCode transitionCode, long seqId) throws IOException;
+ // do some checks to see if the report is valid
+ protected abstract void checkTransition(RegionStateNode regionNode, TransitionCode transitionCode,
+ long seqId) throws UnexpectedStateException;
+
+ // change the in memory state of the regionNode, but do not update meta.
+ protected abstract void updateTransitionWithoutPersistingToMeta(MasterProcedureEnv env,
+ RegionStateNode regionNode, TransitionCode transitionCode, long seqId) throws IOException;
// A bit strange but the procedure store will throw RuntimeException if we can not persist the
// state, so upper layer should take care of this...
@@ -175,7 +179,7 @@ public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedur
throw new UnexpectedStateException("Received report from " + serverName + ", expected " +
targetServer + ", " + regionNode + ", proc=" + this);
}
- reportTransition(regionNode, transitionCode, seqId);
+ checkTransition(regionNode, transitionCode, seqId);
// this state means we have received the report from RS, does not mean the result is fine, as we
// may received a FAILED_OPEN.
this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_REPORT_SUCCEED;
@@ -196,13 +200,24 @@ public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedur
this.seqId = HConstants.NO_SEQNUM;
}
}
+ try {
+ updateTransitionWithoutPersistingToMeta(env, regionNode, transitionCode, seqId);
+ } catch (IOException e) {
+ throw new AssertionError("should not happen", e);
+ }
}
void serverCrashed(MasterProcedureEnv env, RegionStateNode regionNode, ServerName serverName) {
- if (state != RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH) {
+ if (state == RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_SERVER_CRASH) {
// should be a retry
return;
}
+ RegionRemoteProcedureBaseState oldState = state;
+ // it is possible that the state is in REGION_REMOTE_PROCEDURE_SERVER_CRASH, think of this
+ // sequence
+ // 1. region is open on the target server and the above reportTransition call is succeeded
+ // 2. before we are woken up and update the meta, the target server crashes, and then we arrive
+ // here
this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_SERVER_CRASH;
boolean succ = false;
try {
@@ -210,7 +225,21 @@ public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedur
succ = true;
} finally {
if (!succ) {
- this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH;
+ this.state = oldState;
+ }
+ }
+ }
+
+ protected abstract void restoreSucceedState(AssignmentManager am, RegionStateNode regionNode,
+ long seqId) throws IOException;
+
+ void stateLoaded(AssignmentManager am, RegionStateNode regionNode) {
+ if (state == RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_REPORT_SUCCEED) {
+ try {
+ restoreSucceedState(am, regionNode, seqId);
+ } catch (IOException e) {
+ // should not happen as we are just restoring the state
+ throw new AssertionError(e);
}
}
}
@@ -224,10 +253,6 @@ public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedur
getParent(env).unattachRemoteProc(this);
}
- // actually update the state to meta
- protected abstract void updateTransition(MasterProcedureEnv env, RegionStateNode regionNode,
- TransitionCode transitionCode, long seqId) throws IOException;
-
@Override
protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env)
throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException {
@@ -254,7 +279,7 @@ public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedur
throw new ProcedureSuspendedException();
}
case REGION_REMOTE_PROCEDURE_REPORT_SUCCEED:
- updateTransition(env, regionNode, transitionCode, seqId);
+ env.getAssignmentManager().persistToMeta(regionNode);
unattach(env);
return null;
case REGION_REMOTE_PROCEDURE_DISPATCH_FAIL:
@@ -262,7 +287,7 @@ public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedur
unattach(env);
return null;
case REGION_REMOTE_PROCEDURE_SERVER_CRASH:
- env.getAssignmentManager().regionClosed(regionNode, false);
+ env.getAssignmentManager().regionClosedAbnormally(regionNode);
unattach(env);
return null;
default:
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
index 9e017fd..6bf5dc2 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
@@ -403,7 +403,7 @@ public class TransitRegionStateProcedure
remoteProc.serverCrashed(env, regionNode, serverName);
} else {
// we are in RUNNING state, just update the region state, and we will process it later.
- env.getAssignmentManager().regionClosed(regionNode, false);
+ env.getAssignmentManager().regionClosedAbnormally(regionNode);
}
}
@@ -416,6 +416,15 @@ public class TransitRegionStateProcedure
this.remoteProc = null;
}
+ // will be called after we finish loading the meta entry for this region.
+ // used to change the state of the region node if we have a sub procedure, as we may not persist
+ // the state to meta yet. See the code in RegionRemoteProcedureBase.execute for more details.
+ void stateLoaded(AssignmentManager am, RegionStateNode regionNode) {
+ if (remoteProc != null) {
+ remoteProc.stateLoaded(am, regionNode);
+ }
+ }
+
@Override
protected void rollbackState(MasterProcedureEnv env, RegionStateTransitionState state)
throws IOException, InterruptedException {
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestSCPGetRegionsRace.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestSCPGetRegionsRace.java
new file mode 100644
index 0000000..cbbdbdc
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestSCPGetRegionsRace.java
@@ -0,0 +1,218 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master.assignment;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.Future;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.PleaseHoldException;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.StartMiniClusterOption;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.master.HMaster;
+import org.apache.hadoop.hbase.master.MasterServices;
+import org.apache.hadoop.hbase.master.RegionPlan;
+import org.apache.hadoop.hbase.master.ServerManager;
+import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
+import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.IdLock;
+import org.apache.zookeeper.KeeperException;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import org.apache.hbase.thirdparty.com.google.common.collect.Iterables;
+
+import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse;
+
+/**
+ * Testcase for HBASE-22365.
+ */
+@Category({ MasterTests.class, MediumTests.class })
+public class TestSCPGetRegionsRace {
+
+ @ClassRule
+ public static final HBaseClassTestRule CLASS_RULE =
+ HBaseClassTestRule.forClass(TestSCPGetRegionsRace.class);
+
+ private static final List<ServerName> EXCLUDE_SERVERS = new ArrayList<>();
+
+ private static final class ServerManagerForTest extends ServerManager {
+
+ public ServerManagerForTest(MasterServices master) {
+ super(master);
+ }
+
+ @Override
+ public List<ServerName> createDestinationServersList() {
+ return super.createDestinationServersList(EXCLUDE_SERVERS);
+ }
+ }
+
+ private static CountDownLatch ARRIVE_REPORT;
+
+ private static CountDownLatch RESUME_REPORT;
+
+ private static CountDownLatch ARRIVE_GET;
+
+ private static CountDownLatch RESUME_GET;
+
+ private static final class AssignmentManagerForTest extends AssignmentManager {
+
+ public AssignmentManagerForTest(MasterServices master) {
+ super(master);
+ }
+
+ @Override
+ public ReportRegionStateTransitionResponse reportRegionStateTransition(
+ ReportRegionStateTransitionRequest req) throws PleaseHoldException {
+ if (req.getTransition(0).getTransitionCode() == TransitionCode.CLOSED) {
+ if (ARRIVE_REPORT != null) {
+ ARRIVE_REPORT.countDown();
+ try {
+ RESUME_REPORT.await();
+ RESUME_REPORT = null;
+ } catch (InterruptedException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+ return super.reportRegionStateTransition(req);
+ }
+
+ @Override
+ public List<RegionInfo> getRegionsOnServer(ServerName serverName) {
+ List<RegionInfo> regions = super.getRegionsOnServer(serverName);
+ if (ARRIVE_GET != null) {
+ ARRIVE_GET.countDown();
+ try {
+ RESUME_GET.await();
+ RESUME_GET = null;
+ } catch (InterruptedException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ return regions;
+ }
+
+ }
+
+ public static final class HMasterForTest extends HMaster {
+
+ public HMasterForTest(Configuration conf) throws IOException, KeeperException {
+ super(conf);
+ }
+
+ @Override
+ protected AssignmentManager createAssignmentManager(MasterServices master) {
+ return new AssignmentManagerForTest(master);
+ }
+
+ @Override
+ protected ServerManager createServerManager(MasterServices master) throws IOException {
+ setupClusterConnection();
+ return new ServerManagerForTest(master);
+ }
+ }
+
+ private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
+
+ private static TableName NAME = TableName.valueOf("Assign");
+
+ private static byte[] CF = Bytes.toBytes("cf");
+
+ @BeforeClass
+ public static void setUp() throws Exception {
+ UTIL.startMiniCluster(StartMiniClusterOption.builder().masterClass(HMasterForTest.class)
+ .numMasters(1).numRegionServers(3).build());
+ UTIL.createTable(NAME, CF);
+ UTIL.waitTableAvailable(NAME);
+ UTIL.getAdmin().balancerSwitch(false, true);
+ }
+
+ @AfterClass
+ public static void tearDown() throws Exception {
+ UTIL.shutdownMiniCluster();
+ }
+
+ @Test
+ public void test() throws Exception {
+ RegionInfo region =
+ Iterables.getOnlyElement(UTIL.getMiniHBaseCluster().getRegions(NAME)).getRegionInfo();
+ HMaster master = UTIL.getMiniHBaseCluster().getMaster();
+ AssignmentManager am = master.getAssignmentManager();
+ RegionStateNode rsn = am.getRegionStates().getRegionStateNode(region);
+ ServerName source = rsn.getRegionLocation();
+ ServerName dest =
+ UTIL.getAdmin().getRegionServers().stream().filter(sn -> !sn.equals(source)).findAny().get();
+
+ ARRIVE_REPORT = new CountDownLatch(1);
+ RESUME_REPORT = new CountDownLatch(1);
+
+ Future<?> future = am.moveAsync(new RegionPlan(region, source, dest));
+
+ ARRIVE_REPORT.await();
+ ARRIVE_REPORT = null;
+ // let's get procedure lock to stop the TRSP
+ IdLock procExecutionLock = master.getMasterProcedureExecutor().getProcExecutionLock();
+ long procId = master.getProcedures().stream()
+ .filter(p -> p instanceof RegionRemoteProcedureBase).findAny().get().getProcId();
+ IdLock.Entry lockEntry = procExecutionLock.getLockEntry(procId);
+ RESUME_REPORT.countDown();
+
+ // kill the source region server
+ ARRIVE_GET = new CountDownLatch(1);
+ RESUME_GET = new CountDownLatch(1);
+ UTIL.getMiniHBaseCluster().killRegionServer(source);
+
+ // wait until we try to get the region list of the region server
+ ARRIVE_GET.await();
+ ARRIVE_GET = null;
+ // release the procedure lock and let the TRSP to finish
+ procExecutionLock.releaseLockEntry(lockEntry);
+ future.get();
+
+ // resume the SCP
+ EXCLUDE_SERVERS.add(dest);
+ RESUME_GET.countDown();
+ // wait until there are no SCPs and TRSPs
+ UTIL.waitFor(60000, () -> master.getProcedures().stream().allMatch(p -> p.isFinished() ||
+ (!(p instanceof ServerCrashProcedure) && !(p instanceof TransitRegionStateProcedure))));
+
+ // assert the region is only on the dest server.
+ HRegionServer rs = UTIL.getMiniHBaseCluster().getRegionServer(dest);
+ assertNotNull(rs.getRegion(region.getEncodedName()));
+ assertNull(UTIL.getOtherRegionServer(rs).getRegion(region.getEncodedName()));
+ }
+}