You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by zh...@apache.org on 2023/05/23 14:59:43 UTC

[hbase] 01/02: HBASE-27277 TestRaceBetweenSCPAndTRSP fails in pre commit (#5248)

This is an automated email from the ASF dual-hosted git repository.

zhangduo pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/hbase.git

commit 34f724c32aee6b7fdd834fcf14a555ad0c1d3340
Author: Duo Zhang <zh...@apache.org>
AuthorDate: Tue May 23 22:45:18 2023 +0800

    HBASE-27277 TestRaceBetweenSCPAndTRSP fails in pre commit (#5248)
    
    Signed-off-by: GeorryHuang <hu...@apache.org>
    (cherry picked from commit dc30ca552b37c864b962fbeaaed548523f30573b)
---
 .../hbase/procedure2/RemoteProcedureDispatcher.java   |  7 +++++++
 .../master/assignment/TestRaceBetweenSCPAndTRSP.java  | 19 ++++++++++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
index b880043c016..da32377c25b 100644
--- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
+++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java
@@ -17,6 +17,7 @@
  */
 package org.apache.hadoop.hbase.procedure2;
 
+import com.google.errorprone.annotations.RestrictedApi;
 import java.io.IOException;
 import java.lang.Thread.UncaughtExceptionHandler;
 import java.util.HashSet;
@@ -297,6 +298,12 @@ public abstract class RemoteProcedureDispatcher<TEnv, TRemote extends Comparable
     return (List<T>) requestByType.removeAll(type);
   }
 
+  @RestrictedApi(explanation = "Should only be called in tests", link = "",
+      allowedOnPath = ".*/src/test/.*")
+  public boolean hasNode(TRemote key) {
+    return nodeMap.containsKey(key);
+  }
+
   // ============================================================================================
   // Timeout Helpers
   // ============================================================================================
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRaceBetweenSCPAndTRSP.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRaceBetweenSCPAndTRSP.java
index 1aba152d444..cb2bf8de26e 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRaceBetweenSCPAndTRSP.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRaceBetweenSCPAndTRSP.java
@@ -31,6 +31,7 @@ import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.master.HMaster;
 import org.apache.hadoop.hbase.master.MasterServices;
 import org.apache.hadoop.hbase.master.RegionPlan;
+import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher;
 import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
 import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
 import org.apache.hadoop.hbase.testclassification.LargeTests;
@@ -145,16 +146,32 @@ public class TestRaceBetweenSCPAndTRSP {
     Future<byte[]> moveFuture = am.moveAsync(new RegionPlan(region, sn, sn));
     arriveRegionOpening.await();
 
+    // Kill the region server and trigger a SCP
     UTIL.getMiniHBaseCluster().killRegionServer(sn);
+    // Wait until the SCP reaches the getRegionsOnServer call
     arriveGetRegionsOnServer.await();
+    RSProcedureDispatcher remoteDispatcher = UTIL.getMiniHBaseCluster().getMaster()
+      .getMasterProcedureExecutor().getEnvironment().getRemoteDispatcher();
+    // this is necessary for making the UT stable, the problem here is that, in
+    // ServerManager.expireServer, we will submit the SCP and then the SCP will be executed in
+    // another thread(the PEWorker), so when we reach the above getRegionsOnServer call in SCP, it
+    // is still possible that the expireServer call has not been finished so the remote dispatcher
+    // still think it can dispatcher the TRSP, in this way we will be in dead lock as the TRSP will
+    // not schedule a new ORP since it relies on SCP to wake it up after everything is OK. This is
+    // not what we want to test in this UT so we need to wait here to prevent this from happening.
+    // See HBASE-27277 for more detailed analysis.
+    UTIL.waitFor(15000, () -> !remoteDispatcher.hasNode(sn));
+
+    // Resume the TRSP, it should be able to finish
     RESUME_REGION_OPENING.countDown();
-
     moveFuture.get();
+
     ProcedureExecutor<?> procExec =
       UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
     long scpProcId =
       procExec.getProcedures().stream().filter(p -> p instanceof ServerCrashProcedure)
         .map(p -> (ServerCrashProcedure) p).findAny().get().getProcId();
+    // Resume the SCP and make sure it can finish too
     RESUME_GET_REGIONS_ON_SERVER.countDown();
     UTIL.waitFor(60000, () -> procExec.isFinished(scpProcId));
   }