You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by zh...@apache.org on 2020/06/09 03:26:15 UTC

[hbase] branch branch-2 updated: HBASE-24117 Shutdown AssignmentManager before ProcedureExecutor may cause SCP to accidentally skip assigning a region (#1865)

This is an automated email from the ASF dual-hosted git repository.

zhangduo pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2 by this push:
     new c5dacfb  HBASE-24117 Shutdown AssignmentManager before ProcedureExecutor may cause SCP to accidentally skip assigning a region (#1865)
c5dacfb is described below

commit c5dacfbbea2547a77bae47513089408eeabb61da
Author: Duo Zhang <zh...@apache.org>
AuthorDate: Tue Jun 9 11:07:16 2020 +0800

    HBASE-24117 Shutdown AssignmentManager before ProcedureExecutor may cause SCP to accidentally skip assigning a region (#1865)
    
    Signed-off-by: Michael Stack <st...@apache.org>
---
 .../src/main/java/org/apache/hadoop/hbase/master/HMaster.java      | 7 +++++--
 .../apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java | 7 +++++++
 .../hbase/master/assignment/TestCloseRegionWhileRSCrash.java       | 2 +-
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index dd069d9..c497c12 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -1502,6 +1502,11 @@ public class HMaster extends HRegionServer implements MasterServices {
 
     LOG.debug("Stopping service threads");
 
+    // stop procedure executor prior to other services such as server manager and assignment
+    // manager, as these services are important for some running procedures. See HBASE-24117 for
+    // example.
+    stopProcedureExecutor();
+
     if (this.quotaManager != null) {
       this.quotaManager.stop();
     }
@@ -1516,8 +1521,6 @@ public class HMaster extends HRegionServer implements MasterServices {
       this.assignmentManager.stop();
     }
 
-    stopProcedureExecutor();
-
     if (masterRegion != null) {
       masterRegion.close(isAborted());
     }
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
index 076c266..6ca8c0c 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
@@ -24,6 +24,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import org.apache.hadoop.hbase.DoNotRetryIOException;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.client.RegionInfoBuilder;
@@ -485,6 +486,12 @@ public class ServerCrashProcedure
         // UPDATE: HBCKServerCrashProcedure overrides isMatchingRegionLocation; this check can get
         // in the way of our clearing out 'Unknown Servers'.
         if (!isMatchingRegionLocation(regionNode)) {
+          // See HBASE-24117, though we have already changed the shutdown order, it is still worth
+          // double checking here to confirm that we do not skip assignment incorrectly.
+          if (!am.isRunning()) {
+            throw new DoNotRetryIOException(
+              "AssignmentManager has been stopped, can not process assignment any more");
+          }
           LOG.info("{} found {} whose regionLocation no longer matches {}, skipping assign...",
             this, regionNode, serverName);
           continue;
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java
index 4761991..75f73e5 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java
@@ -163,7 +163,7 @@ public class TestCloseRegionWhileRSCrash {
     UTIL.shutdownMiniCluster();
   }
 
-  @org.junit.Ignore @Test // Until root-cause of flakeyness, HBASE-24117, is addressed.
+  @Test
   public void testRetryBackoff() throws IOException, InterruptedException {
     HRegionServer srcRs = UTIL.getRSForFirstRegionInTable(TABLE_NAME);
     RegionInfo region = srcRs.getRegions(TABLE_NAME).get(0).getRegionInfo();