You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2017/12/14 07:05:14 UTC

hbase git commit: master hangs forever if RecoverMeta send assign meta region request to target server fail

Repository: hbase
Updated Branches:
  refs/heads/master ba5f9ac38 -> d3aeaeffa


master hangs forever if RecoverMeta send assign meta region request to target server fail


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/d3aeaeff
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/d3aeaeff
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/d3aeaeff

Branch: refs/heads/master
Commit: d3aeaeffa455f32b1eb2fbb871b39707457f684d
Parents: ba5f9ac
Author: Yi Liang <yl...@us.ibm.com>
Authored: Wed Dec 13 13:49:59 2017 -0800
Committer: Michael Stack <st...@apache.org>
Committed: Wed Dec 13 22:29:19 2017 -0800

----------------------------------------------------------------------
 .../hadoop/hbase/master/ServerManager.java      |  4 ++
 .../master/assignment/AssignmentManager.java    | 32 +++++++++-
 .../hbase/master/MockNoopMasterServices.java    |  7 +-
 .../master/assignment/MockMasterServices.java   | 25 ++++++++
 .../assignment/TestAssignmentManager.java       | 67 ++++++++++++++++++++
 5 files changed, 133 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/d3aeaeff/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index 79ffc8a..78661d2 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -572,6 +572,10 @@ public class ServerManager {
     if (!master.isServerCrashProcessingEnabled()) {
       LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
           + "delay expiring server " + serverName);
+      // Even though we delay expiring this server, we still need to handle Meta's RIT
+      // that targets the crashed server: while RecoverMetaProcedure runs, SCP is not
+      // enabled yet, so Meta's RIT may stay suspended forever. See HBASE-19287
+      master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName);
       this.queuedDeadServers.add(serverName);
       return;
     }

http://git-wip-us.apache.org/repos/asf/hbase/blob/d3aeaeff/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index cebe0b0..7a1c8af 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -48,6 +48,7 @@ import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.YouAreDeadException;
 import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.client.RegionInfoBuilder;
+import org.apache.hadoop.hbase.client.RegionReplicaUtil;
 import org.apache.hadoop.hbase.client.TableState;
 import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
 import org.apache.hadoop.hbase.favored.FavoredNodesManager;
@@ -70,6 +71,7 @@ import org.apache.hadoop.hbase.master.normalizer.RegionNormalizer;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
 import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
+import org.apache.hadoop.hbase.master.procedure.ServerCrashException;
 import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
 import org.apache.hadoop.hbase.procedure2.Procedure;
 import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
@@ -78,6 +80,7 @@ import org.apache.hadoop.hbase.procedure2.ProcedureInMemoryChore;
 import org.apache.hadoop.hbase.procedure2.util.StringUtils;
 import org.apache.hadoop.hbase.shaded.com.google.common.annotations.VisibleForTesting;
 import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
@@ -1322,7 +1325,7 @@ public class AssignmentManager implements ServerListener {
   }
 
   public void submitServerCrash(final ServerName serverName, final boolean shouldSplitWal) {
-    boolean carryingMeta = master.getAssignmentManager().isCarryingMeta(serverName);
+    boolean carryingMeta = isCarryingMeta(serverName);
     ProcedureExecutor<MasterProcedureEnv> procExec = this.master.getMasterProcedureExecutor();
     procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(), serverName,
       shouldSplitWal, carryingMeta));
@@ -1853,4 +1856,31 @@ public class AssignmentManager implements ServerListener {
     }*/
     master.getServerManager().expireServer(serverNode.getServerName());
   }
+
+  /**
+   * Handles a region-in-transition (RIT) of the default meta replica that targets a crashed
+   * server. Called from ServerManager when server crash processing is not yet enabled.
+   * NOTE(review): regionStateNode and its server name are dereferenced without null checks.
+   * @param serverName Server that has already crashed
+   */
+  public void handleMetaRITOnCrashedServer(ServerName serverName) {
+    RegionInfo hri = RegionReplicaUtil
+        .getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO,
+            RegionInfo.DEFAULT_REPLICA_ID);
+    RegionState regionStateNode = getRegionStates().getRegionState(hri);
+    if (!regionStateNode.getServerName().equals(serverName)) {
+      return;
+    }
+    // Meta's region state points at the crashed server.
+    LOG.info("Meta has been assigned to crashed server: " + serverName + "; will do re-assign");
+    // Report the remote call as failed on the in-flight meta transition procedure, if any.
+    RegionTransitionProcedure rtp = getRegionStates().getRegionTransitionProcedure(hri);
+    // Only the REGION_TRANSITION_DISPATCH step is handled; REGION_TRANSITION_QUEUE is skipped.
+    if (rtp != null && rtp.isMeta()
+        && rtp.getTransitionState() == RegionTransitionState.REGION_TRANSITION_DISPATCH) {
+      LOG.info("Re-do rit procedure: " + rtp.toString());
+      rtp.remoteCallFailed(master.getMasterProcedureExecutor().getEnvironment(), serverName,
+          new ServerCrashException(rtp.getProcId(), serverName));
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/d3aeaeff/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java
index 6a3d5c7..413abe3 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java
@@ -212,9 +212,14 @@ public class MockNoopMasterServices implements MasterServices, Server {
     return null;
   }
 
+  private boolean serverCrashProcessingEnabled = true; // defaults to enabled
+  // Test hook: sets the value returned by isServerCrashProcessingEnabled().
+  public void setServerCrashProcessingEnabled(boolean b) {
+    serverCrashProcessingEnabled = b;
+  }
   @Override
   public boolean isServerCrashProcessingEnabled() {
-    return true;
+    return serverCrashProcessingEnabled;
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/hbase/blob/d3aeaeff/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java
index 6a75729..764aa4a 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java
@@ -21,6 +21,7 @@ import static org.mockito.ArgumentMatchers.any;
 
 import java.io.IOException;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Map;
 import java.util.NavigableMap;
 import java.util.SortedSet;
@@ -173,6 +174,30 @@ public class MockMasterServices extends MockNoopMasterServices {
     this.procedureExecutor.getEnvironment().setEventReady(initialized, true);
   }
 
+  /**
+   * Simulates a region server restart; call only after MockMasterServices#start().
+   * The online server with the same address as serverName is looked up and reported
+   * again with its start code incremented by one. Does nothing if no online server matches.
+   * Servers can be told apart by port number; see ServerName in MockMasterServices#start().
+   *
+   * @param serverName Server name to be restarted
+   */
+  public void restartRegionServer(ServerName serverName) throws IOException {
+    List<ServerName> onlineServers = serverManager.getOnlineServersList();
+    long startCode = -1;
+    for (ServerName s : onlineServers) {
+      if (s.getAddress().equals(serverName.getAddress())) {
+        startCode = s.getStartcode() + 1;
+        break;
+      }
+    }
+    if (startCode == -1) {
+      return;
+    }
+    ServerName sn = ServerName.valueOf(serverName.getAddress().toString(), startCode);
+    serverManager.regionServerReport(sn, ServerLoad.EMPTY_SERVERLOAD);
+  }
+
   @Override
   public void stop(String why) {
     stopProcedureExecutor();

http://git-wip-us.apache.org/repos/asf/hbase/blob/d3aeaeff/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
index 1912d11..c328c71 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
@@ -444,6 +444,34 @@ public class TestAssignmentManager {
     assertEquals(unassignFailedCount, unassignProcMetrics.getFailedCounter().getCount());
   }
 
+  /**
+   * It is possible that the AM sends an assign-meta request to an RS successfully but the
+   * RS never sends back a response, which would cause master startup to hang forever.
+   */
+  @Test
+  public void testAssignMetaAndCrashBeforeResponse() throws Exception {
+    tearDown();
+    // As in setUp(): start the mock master, stopping short of setting up meta.
+    UTIL = new HBaseTestingUtility();
+    this.executor = Executors.newSingleThreadScheduledExecutor();
+    setupConfiguration(UTIL.getConfiguration());
+    master = new MockMasterServices(UTIL.getConfiguration(), this.regionsToRegionServers);
+    rsDispatcher = new MockRSProcedureDispatcher(master);
+    master.start(NSERVERS, rsDispatcher);
+    am = master.getAssignmentManager();
+
+    // Assign meta while server crash processing is disabled.
+    master.setServerCrashProcessingEnabled(false);
+    rsDispatcher.setMockRsExecutor(new HangThenRSRestartExecutor());
+    am.assign(RegionInfoBuilder.FIRST_META_REGIONINFO);
+    assertEquals(true, am.isMetaInitialized());
+
+    // Restore the default setting; see setUpMeta().
+    master.setServerCrashProcessingEnabled(true);
+    am.wakeMetaLoadedEvent();
+    am.setFailoverCleanupDone(true);
+  }
+
   private Future<byte[]> submitProcedure(final Procedure proc) {
     return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc);
   }
@@ -527,6 +555,14 @@ public class TestAssignmentManager {
     this.am.submitServerCrash(serverName, false/*No WALs here*/);
   }
 
+  private void doRestart(final ServerName serverName) { // best-effort restart of a mock RS
+    try {
+      this.master.restartRegionServer(serverName);
+    } catch (IOException e) {
+      LOG.warn("Can not restart RS with new startcode", e); // keep the cause in the log
+    }
+  }
+
   private class NoopRsExecutor implements MockRSExecutor {
     public ExecuteProceduresResponse sendRequest(ServerName server,
         ExecuteProceduresRequest request) throws IOException {
@@ -678,6 +714,37 @@ public class TestAssignmentManager {
     }
   }
 
+  /**
+   * Takes the first open request and returns nothing, acting like an RS that went zombie:
+   * with no response, the procedure stays stuck/suspended on the Master and won't wake up.
+   * Unlike HangThenRSCrashExecutor, which creates a ServerCrashProcedure to handle the
+   * server crash, this executor schedules a direct restart of the RS one second later,
+   * covering the case where an RS crashes while SCP is not enabled.
+   */
+  private class HangThenRSRestartExecutor extends GoodRsExecutor {
+    private int invocations;
+
+    @Override
+    protected RegionOpeningState execOpenRegion(final ServerName server, RegionOpenInfo openReq)
+        throws IOException {
+      if (this.invocations++ > 0) {
+        // Second and later calls are delegated to the parent executor.
+        return super.execOpenRegion(server, openReq);
+      }
+      // The procedure on master will just hang forever because nothing comes back
+      // from the RS in this case.
+      LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout");
+      executor.schedule(new Runnable() {
+        @Override
+        public void run() {
+          LOG.info("Restarting RS of " + server);
+          doRestart(server);
+        }
+      }, 1, TimeUnit.SECONDS);
+      return null;
+    }
+  }
+
   private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor {
     public static final int TYPES_OF_FAILURE = 6;
     private int invocations;