You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2017/12/14 07:05:14 UTC
hbase git commit: master hangs forever if RecoverMeta send assign
meta region request to target server fail
Repository: hbase
Updated Branches:
refs/heads/master ba5f9ac38 -> d3aeaeffa
master hangs forever if RecoverMeta send assign meta region request to target server fail
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/d3aeaeff
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/d3aeaeff
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/d3aeaeff
Branch: refs/heads/master
Commit: d3aeaeffa455f32b1eb2fbb871b39707457f684d
Parents: ba5f9ac
Author: Yi Liang <yl...@us.ibm.com>
Authored: Wed Dec 13 13:49:59 2017 -0800
Committer: Michael Stack <st...@apache.org>
Committed: Wed Dec 13 22:29:19 2017 -0800
----------------------------------------------------------------------
.../hadoop/hbase/master/ServerManager.java | 4 ++
.../master/assignment/AssignmentManager.java | 32 +++++++++-
.../hbase/master/MockNoopMasterServices.java | 7 +-
.../master/assignment/MockMasterServices.java | 25 ++++++++
.../assignment/TestAssignmentManager.java | 67 ++++++++++++++++++++
5 files changed, 133 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hbase/blob/d3aeaeff/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index 79ffc8a..78661d2 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -572,6 +572,10 @@ public class ServerManager {
if (!master.isServerCrashProcessingEnabled()) {
LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
+ "delay expiring server " + serverName);
+ // Even we delay expire this server, we still need to handle Meta's RIT
+ // that are against the crashed server; since when we do RecoverMetaProcedure,
+ // the SCP is not enable yet and Meta's RIT may be suspend forever. See HBase-19287
+ master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName);
this.queuedDeadServers.add(serverName);
return;
}
http://git-wip-us.apache.org/repos/asf/hbase/blob/d3aeaeff/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index cebe0b0..7a1c8af 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -48,6 +48,7 @@ import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.YouAreDeadException;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
+import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.client.TableState;
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
import org.apache.hadoop.hbase.favored.FavoredNodesManager;
@@ -70,6 +71,7 @@ import org.apache.hadoop.hbase.master.normalizer.RegionNormalizer;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
+import org.apache.hadoop.hbase.master.procedure.ServerCrashException;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
@@ -78,6 +80,7 @@ import org.apache.hadoop.hbase.procedure2.ProcedureInMemoryChore;
import org.apache.hadoop.hbase.procedure2.util.StringUtils;
import org.apache.hadoop.hbase.shaded.com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
@@ -1322,7 +1325,7 @@ public class AssignmentManager implements ServerListener {
}
public void submitServerCrash(final ServerName serverName, final boolean shouldSplitWal) {
- boolean carryingMeta = master.getAssignmentManager().isCarryingMeta(serverName);
+ boolean carryingMeta = isCarryingMeta(serverName);
ProcedureExecutor<MasterProcedureEnv> procExec = this.master.getMasterProcedureExecutor();
procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(), serverName,
shouldSplitWal, carryingMeta));
@@ -1853,4 +1856,31 @@ public class AssignmentManager implements ServerListener {
}*/
master.getServerManager().expireServer(serverNode.getServerName());
}
+
+ /**
+ * Handle RIT of meta region against crashed server
+ * Only used when ServerCrashProcedure is not enabled.
+ *
+ * @param serverName Server that has already crashed
+ */
+ public void handleMetaRITOnCrashedServer(ServerName serverName) {
+ RegionInfo hri = RegionReplicaUtil
+ .getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO,
+ RegionInfo.DEFAULT_REPLICA_ID);
+ RegionState regionStateNode = getRegionStates().getRegionState(hri);
+ if (!regionStateNode.getServerName().equals(serverName)) {
+ return;
+ }
+ // meta has been assigned to crashed server.
+ LOG.info("Meta has been assigned to crashed server: " + serverName + "; will do re-assign");
+ // handle failure and wake event
+ RegionTransitionProcedure rtp = getRegionStates().getRegionTransitionProcedure(hri);
+ // Not need to consider for REGION_TRANSITION_QUEUE step
+ if (rtp != null && rtp.isMeta()
+ && rtp.getTransitionState() == RegionTransitionState.REGION_TRANSITION_DISPATCH) {
+ LOG.info("Re-do rit procedure: " + rtp.toString());
+ rtp.remoteCallFailed(master.getMasterProcedureExecutor().getEnvironment(), serverName,
+ new ServerCrashException(rtp.getProcId(), serverName));
+ }
+ }
}
http://git-wip-us.apache.org/repos/asf/hbase/blob/d3aeaeff/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java
index 6a3d5c7..413abe3 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java
@@ -212,9 +212,14 @@ public class MockNoopMasterServices implements MasterServices, Server {
return null;
}
+ private boolean serverCrashProcessingEnabled = true;
+
+ public void setServerCrashProcessingEnabled(boolean b) {
+ serverCrashProcessingEnabled = b;
+ }
@Override
public boolean isServerCrashProcessingEnabled() {
- return true;
+ return serverCrashProcessingEnabled;
}
@Override
http://git-wip-us.apache.org/repos/asf/hbase/blob/d3aeaeff/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java
index 6a75729..764aa4a 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java
@@ -21,6 +21,7 @@ import static org.mockito.ArgumentMatchers.any;
import java.io.IOException;
import java.util.HashSet;
+import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.SortedSet;
@@ -173,6 +174,30 @@ public class MockMasterServices extends MockNoopMasterServices {
this.procedureExecutor.getEnvironment().setEventReady(initialized, true);
}
+ /**
+ * Call this restart method only after running MockMasterServices#start()
+ * The RSs can be differentiated by the port number, see
+ * ServerName in MockMasterServices#start() method above.
+ * Restart of region server will have new startcode in server name
+ *
+ * @param serverName Server name to be restarted
+ */
+ public void restartRegionServer(ServerName serverName) throws IOException {
+ List<ServerName> onlineServers = serverManager.getOnlineServersList();
+ long startCode = -1;
+ for (ServerName s : onlineServers) {
+ if (s.getAddress().equals(serverName.getAddress())) {
+ startCode = s.getStartcode() + 1;
+ break;
+ }
+ }
+ if (startCode == -1) {
+ return;
+ }
+ ServerName sn = ServerName.valueOf(serverName.getAddress().toString(), startCode);
+ serverManager.regionServerReport(sn, ServerLoad.EMPTY_SERVERLOAD);
+ }
+
@Override
public void stop(String why) {
stopProcedureExecutor();
http://git-wip-us.apache.org/repos/asf/hbase/blob/d3aeaeff/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
index 1912d11..c328c71 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
@@ -444,6 +444,34 @@ public class TestAssignmentManager {
assertEquals(unassignFailedCount, unassignProcMetrics.getFailedCounter().getCount());
}
+ /**
+ * It is possible that when AM send assign meta request to a RS successfully,
+ * but RS can not send back any response, which cause master startup hangs forever
+ */
+ @Test
+ public void testAssignMetaAndCrashBeforeResponse() throws Exception {
+ tearDown();
+ // See setUp(), start HBase until set up meta
+ UTIL = new HBaseTestingUtility();
+ this.executor = Executors.newSingleThreadScheduledExecutor();
+ setupConfiguration(UTIL.getConfiguration());
+ master = new MockMasterServices(UTIL.getConfiguration(), this.regionsToRegionServers);
+ rsDispatcher = new MockRSProcedureDispatcher(master);
+ master.start(NSERVERS, rsDispatcher);
+ am = master.getAssignmentManager();
+
+ // Assign meta
+ master.setServerCrashProcessingEnabled(false);
+ rsDispatcher.setMockRsExecutor(new HangThenRSRestartExecutor());
+ am.assign(RegionInfoBuilder.FIRST_META_REGIONINFO);
+ assertEquals(true, am.isMetaInitialized());
+
+ // set it back as default, see setUpMeta()
+ master.setServerCrashProcessingEnabled(true);
+ am.wakeMetaLoadedEvent();
+ am.setFailoverCleanupDone(true);
+ }
+
private Future<byte[]> submitProcedure(final Procedure proc) {
return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc);
}
@@ -527,6 +555,14 @@ public class TestAssignmentManager {
this.am.submitServerCrash(serverName, false/*No WALs here*/);
}
+ private void doRestart(final ServerName serverName) {
+ try {
+ this.master.restartRegionServer(serverName);
+ } catch (IOException e) {
+ LOG.warn("Can not restart RS with new startcode");
+ }
+ }
+
private class NoopRsExecutor implements MockRSExecutor {
public ExecuteProceduresResponse sendRequest(ServerName server,
ExecuteProceduresRequest request) throws IOException {
@@ -678,6 +714,37 @@ public class TestAssignmentManager {
}
}
+ /**
+ * Takes open request and then returns nothing so acts like a RS that went zombie.
+ * No response (so proc is stuck/suspended on the Master and won't wake up.).
+ * Different with HangThenRSCrashExecutor, HangThenRSCrashExecutor will create
+ * ServerCrashProcedure to handle the server crash. However, this HangThenRSRestartExecutor
+ * will restart RS directly, situation for RS crashed when SCP is not enabled.
+ */
+ private class HangThenRSRestartExecutor extends GoodRsExecutor {
+ private int invocations;
+
+ @Override
+ protected RegionOpeningState execOpenRegion(final ServerName server, RegionOpenInfo openReq)
+ throws IOException {
+ if (this.invocations++ > 0) {
+ // Return w/o problem the second time through here.
+ return super.execOpenRegion(server, openReq);
+ }
+ // The procedure on master will just hang forever because nothing comes back
+ // from the RS in this case.
+ LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout");
+ executor.schedule(new Runnable() {
+ @Override
+ public void run() {
+ LOG.info("Restarting RS of " + server);
+ doRestart(server);
+ }
+ }, 1, TimeUnit.SECONDS);
+ return null;
+ }
+ }
+
private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor {
public static final int TYPES_OF_FAILURE = 6;
private int invocations;