You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ps...@apache.org on 2021/03/11 16:51:54 UTC

[hbase] branch branch-2.3 updated: HBASE-25587 [hbck2] Schedule SCP for all unknown servers (#2978)

This is an automated email from the ASF dual-hosted git repository.

psomogyi pushed a commit to branch branch-2.3
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2.3 by this push:
     new 092bbaf  HBASE-25587 [hbck2] Schedule SCP for all unknown servers (#2978)
092bbaf is described below

commit 092bbaf9b4d78841401ca657e330e78327e10176
Author: Peter Somogyi <ps...@apache.org>
AuthorDate: Thu Mar 11 17:20:36 2021 +0100

    HBASE-25587 [hbck2] Schedule SCP for all unknown servers (#2978)
    
    Signed-off-by: Wellington Chevreuil <wc...@apache.org>
---
 .../org/apache/hadoop/hbase/client/HBaseHbck.java  | 16 +++++++
 .../java/org/apache/hadoop/hbase/client/Hbck.java  |  2 +
 .../src/main/protobuf/Master.proto                 | 10 ++++
 .../hadoop/hbase/master/MasterRpcServices.java     | 25 ++++++++++
 .../apache/hadoop/hbase/master/ServerManager.java  | 11 +++++
 .../hadoop/hbase/master/procedure/TestHBCKSCP.java | 30 +++++++-----
 .../hbase/master/procedure/TestHBCKSCPUnknown.java | 55 ++++++++++++++++++++++
 7 files changed, 137 insertions(+), 12 deletions(-)

diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java
index d146a90..85920b6 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java
@@ -45,6 +45,8 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.HbckServic
 import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RegionSpecifierAndState;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreRequest;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.RunHbckChoreResponse;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ScheduleSCPsForUnknownServersRequest;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ScheduleSCPsForUnknownServersResponse;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ScheduleServerCrashProcedureResponse;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.UnassignsResponse;
 
@@ -202,6 +204,20 @@ public class HBaseHbck implements Hbck {
   }
 
   @Override
+  public List<Long> scheduleSCPsForUnknownServers() throws IOException {
+    // The request carries no arguments; the master decides which servers are unknown.
+    final ScheduleSCPsForUnknownServersRequest request =
+      ScheduleSCPsForUnknownServersRequest.newBuilder().build();
+    try {
+      return this.hbck.scheduleSCPsForUnknownServers(rpcControllerFactory.newController(),
+        request).getPidList();
+    } catch (ServiceException se) {
+      LOG.debug("Failed to run ServerCrashProcedures for unknown servers", se);
+      throw new IOException(se);
+    }
+  }
+
+  @Override
   public boolean runHbckChore() throws IOException {
     try {
       RunHbckChoreResponse response = this.hbck.runHbckChore(rpcControllerFactory.newController(),
diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java
index 57d2912..99befa4 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java
@@ -130,6 +130,8 @@ public interface Hbck extends Abortable, Closeable {
 
   List<Long> scheduleServerCrashProcedures(List<ServerName> serverNames) throws IOException;
 
+  List<Long> scheduleSCPsForUnknownServers() throws IOException;
+
   /**
    * Request HBCK chore to run at master side.
    *
diff --git a/hbase-protocol-shaded/src/main/protobuf/Master.proto b/hbase-protocol-shaded/src/main/protobuf/Master.proto
index 505eb81..f72acc1 100644
--- a/hbase-protocol-shaded/src/main/protobuf/Master.proto
+++ b/hbase-protocol-shaded/src/main/protobuf/Master.proto
@@ -1151,6 +1151,12 @@ message ScheduleServerCrashProcedureResponse {
   repeated uint64 pid = 1;
 }
 
+message ScheduleSCPsForUnknownServersRequest {}
+
+message ScheduleSCPsForUnknownServersResponse {
+  repeated uint64 pid = 1;
+}
+
 message FixMetaRequest {}
 
 message FixMetaResponse {}
@@ -1190,6 +1196,10 @@ service HbckService {
   rpc ScheduleServerCrashProcedure(ScheduleServerCrashProcedureRequest)
     returns(ScheduleServerCrashProcedureResponse);
 
+  /** Schedule a ServerCrashProcedure for each unknown server */
+  rpc ScheduleSCPsForUnknownServers(ScheduleSCPsForUnknownServersRequest)
+    returns(ScheduleSCPsForUnknownServersResponse);
+
   /**
    * Request HBCK chore to run at master side.
    */
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java
index 566482a..02d2dc0 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java
@@ -2667,6 +2667,31 @@ public class MasterRpcServices extends RSRpcServices implements
   }
 
   @Override
+  public MasterProtos.ScheduleSCPsForUnknownServersResponse scheduleSCPsForUnknownServers(
+      RpcController controller, MasterProtos.ScheduleSCPsForUnknownServersRequest request)
+      throws ServiceException {
+    List<Long> pids = new ArrayList<>();
+    // Candidates are the servers that regions are recorded against. Drop null names:
+    // isServerUnknown(null) is true, but there is no server to run an SCP for.
+    final Set<ServerName> unknownServerNames =
+      master.getAssignmentManager().getRegionStates().getRegionStates().stream()
+        .map(RegionState::getServerName).filter(sn -> sn != null).distinct()
+        .filter(sn -> master.getServerManager().isServerUnknown(sn))
+        .collect(Collectors.toSet());
+
+    for (ServerName sn: unknownServerNames) {
+      LOG.info("{} schedule ServerCrashProcedure for unknown {}",
+        this.master.getClientIdAuditPrefix(), sn);
+      if (shouldSubmitSCP(sn)) {
+        pids.add(this.master.getServerManager().expireServer(sn, true));
+      } else {
+        pids.add(Procedure.NO_PROC_ID);
+      }
+    }
+    return MasterProtos.ScheduleSCPsForUnknownServersResponse.newBuilder().addAllPid(pids).build();
+  }
+
+  @Override
   public FixMetaResponse fixMeta(RpcController controller, FixMetaRequest request)
       throws ServiceException {
     rpcPreCheck("fixMeta");
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
index af12e3a..9dc3957 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
@@ -907,6 +907,17 @@ public class ServerManager {
     return serverName == null || deadservers.isDeadServer(serverName);
   }
 
+  /**
+   * Tell whether a server is unknown to this manager: it is neither online nor known to be
+   * dead, e.g. a very old previous instance the master no longer tracks. Null is unknown.
+   */
+  public boolean isServerUnknown(ServerName serverName) {
+    if (serverName == null) {
+      return true;
+    }
+    return !(onlineServers.containsKey(serverName) || deadservers.isDeadServer(serverName));
+  }
+
   public void shutdownCluster() {
     String statusStr = "Cluster shutdown requested of master=" + this.master.getServerName();
     LOG.info(statusStr);
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java
index a4d251f..beb4fe3 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java
@@ -40,12 +40,8 @@ import org.apache.hadoop.hbase.master.HMaster;
 import org.apache.hadoop.hbase.master.RegionState;
 import org.apache.hadoop.hbase.procedure2.Procedure;
 import org.apache.hadoop.hbase.regionserver.HRegionServer;
-import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
-import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
-
 import org.apache.hadoop.hbase.testclassification.LargeTests;
 import org.apache.hadoop.hbase.testclassification.MasterTests;
-
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.Pair;
 import org.apache.hadoop.hbase.util.Threads;
@@ -57,6 +53,10 @@ import org.junit.rules.TestName;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
+import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
+
 
 /**
  * Test of the HBCK-version of SCP.
@@ -109,15 +109,16 @@ public class TestHBCKSCP extends TestSCPBase {
     // not be processing this server 'normally'. Remove it from processing by
     // calling 'finish' and then remove it from dead servers so rsServerName
     // becomes an 'Unknown Server' even though it is still around.
+    LOG.info("Killing {}", rsServerName);
+    cluster.killRegionServer(rsServerName);
+
     master.getServerManager().moveFromOnlineToDeadServers(rsServerName);
     master.getServerManager().getDeadServers().finish(rsServerName);
     master.getServerManager().getDeadServers().removeDeadServer(rsServerName);
     master.getAssignmentManager().getRegionStates().removeServer(rsServerName);
     // Kill the server. Nothing should happen since an 'Unknown Server' as far
     // as the Master is concerned; i.e. no SCP.
-    LOG.info("Killing {}", rsServerName);
     HRegionServer hrs = cluster.getRegionServer(rsServerName);
-    hrs.abort("KILLED");
     while (!hrs.isStopped()) {
       Threads.sleep(10);
     }
@@ -135,12 +136,7 @@ public class TestHBCKSCP extends TestSCPBase {
 
     // I now have 'Unknown Server' references in hbase:meta; i.e. Server references
     // with no corresponding SCP. Queue one.
-    MasterProtos.ScheduleServerCrashProcedureResponse response =
-        master.getMasterRpcServices().scheduleServerCrashProcedure(null,
-            MasterProtos.ScheduleServerCrashProcedureRequest.newBuilder().
-                addServerName(ProtobufUtil.toServerName(rsServerName)).build());
-    assertEquals(1, response.getPidCount());
-    long pid = response.getPid(0);
+    long pid = scheduleHBCKSCP(rsServerName, master);
     assertNotEquals(Procedure.NO_PROC_ID, pid);
     while (master.getMasterProcedureExecutor().getActiveProcIds().contains(pid)) {
       Threads.sleep(10);
@@ -156,6 +152,16 @@ public class TestHBCKSCP extends TestSCPBase {
     assertFalse(searchMeta(master, rsServerName));
   }
 
+  /** Schedule an HBCK SCP for <code>rsServerName</code>; exactly one pid must come back. */
+  protected long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
+    MasterProtos.ScheduleServerCrashProcedureResponse resp =
+        master.getMasterRpcServices().scheduleServerCrashProcedure(null,
+            MasterProtos.ScheduleServerCrashProcedureRequest.newBuilder()
+                .addServerName(ProtobufUtil.toServerName(rsServerName)).build());
+    assertEquals(1, resp.getPidCount());
+    return resp.getPid(0);
+  }
+
   /**
    * @return True if we find reference to <code>sn</code> in meta table.
    */
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCPUnknown.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCPUnknown.java
new file mode 100644
index 0000000..6702f40
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCPUnknown.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master.procedure;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.master.HMaster;
+import org.apache.hadoop.hbase.testclassification.LargeTests;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+
+import org.junit.ClassRule;
+import org.junit.experimental.categories.Category;
+import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
+
+
+/**
+ * Runs the {@link TestHBCKSCP} scenario but schedules the SCP through the no-argument
+ * scheduleSCPsForUnknownServers Master RPC instead of naming the crashed server.
+ */
+@Category({ MasterTests.class, LargeTests.class })
+public class TestHBCKSCPUnknown extends TestHBCKSCP {
+
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+      HBaseClassTestRule.forClass(TestHBCKSCPUnknown.class);
+
+  @Override
+  protected long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
+    // rsServerName is not passed on: the Master works out the unknown servers itself.
+    MasterProtos.ScheduleSCPsForUnknownServersRequest request =
+        MasterProtos.ScheduleSCPsForUnknownServersRequest.newBuilder().build();
+    MasterProtos.ScheduleSCPsForUnknownServersResponse reply =
+        master.getMasterRpcServices().scheduleSCPsForUnknownServers(null, request);
+    assertEquals(1, reply.getPidCount());
+    return reply.getPid(0);
+  }
+}