You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2020/01/02 22:32:57 UTC

[hbase] branch branch-2.2 updated: HBASE-23596 HBCKServerCrashProcedure can double assign

This is an automated email from the ASF dual-hosted git repository.

stack pushed a commit to branch branch-2.2
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-2.2 by this push:
     new 189a3b1  HBASE-23596 HBCKServerCrashProcedure can double assign
189a3b1 is described below

commit 189a3b1cf6ce832ce7aa832fffa25c7179f5683e
Author: stack <st...@apache.org>
AuthorDate: Wed Dec 18 23:08:24 2019 -0800

    HBASE-23596 HBCKServerCrashProcedure can double assign
    
    Signed-off-by: Duo Zhang <zh...@apache.org>
    Signed-off-by: Lijin Bin <bi...@apache.org>
    Signed-off-by: Viraj Jasani <vj...@apache.org>
    
    Change its behavior so it will only look in hbase:meta
    if the call to the super class turns up zero references.
    Only then will it search hbase:meta for references to
    'Unknown Servers'. Normal operation where we read Master
    context is usual and sufficient. The scan of hbase:meta
    is only for case where Master state has been corrupted
    and we need to clear out 'Unknown Servers'.
---
 .../org/apache/hadoop/hbase/MetaTableAccessor.java |  13 ++-
 .../hbase/master/assignment/RegionStateStore.java  |  10 +-
 .../master/procedure/HBCKServerCrashProcedure.java | 120 +++++++++++++++++----
 .../apache/hadoop/hbase/HBaseTestingUtility.java   |   3 +-
 .../hadoop/hbase/master/procedure/TestHBCKSCP.java |   1 +
 5 files changed, 116 insertions(+), 31 deletions(-)

diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
index d9dc779..0e41146 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
@@ -1448,7 +1448,7 @@ public class MetaTableAccessor {
     }
   }
 
-  private static void addRegionStateToPut(Put put, RegionState.State state) throws IOException {
+  private static Put addRegionStateToPut(Put put, RegionState.State state) throws IOException {
     put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY)
         .setRow(put.getRow())
         .setFamily(HConstants.CATALOG_FAMILY)
@@ -1457,6 +1457,17 @@ public class MetaTableAccessor {
         .setType(Cell.Type.Put)
         .setValue(Bytes.toBytes(state.name()))
         .build());
+    return put;
+  }
+
+  /**
+   * Update state column in hbase:meta.
+   */
+  public static void updateRegionState(Connection connection, RegionInfo ri,
+      RegionState.State state) throws IOException {
+    Put put = new Put(RegionReplicaUtil.getRegionInfoForDefaultReplica(ri).getRegionName());
+    MetaTableAccessor.putsToMetaTable(connection,
+        Collections.singletonList(addRegionStateToPut(put, state)));
   }
 
   /**
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java
index df7f1bb..5ee22c3 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java
@@ -51,7 +51,6 @@ import org.apache.zookeeper.KeeperException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
 import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
 
 /**
@@ -115,7 +114,7 @@ public class RegionStateStore {
       if (regionInfo == null) continue;
 
       final int replicaId = regionInfo.getReplicaId();
-      final State state = getRegionState(result, replicaId, regionInfo);
+      final State state = getRegionState(result, regionInfo);
 
       final ServerName lastHost = hrl.getServerName();
       final ServerName regionLocation = getRegionServer(result, replicaId);
@@ -326,12 +325,11 @@ public class RegionStateStore {
 
   /**
    * Pull the region state from a catalog table {@link Result}.
-   * @param r Result to pull the region state from
    * @return the region state, or null if unknown.
    */
-  @VisibleForTesting
-  public static State getRegionState(final Result r, int replicaId, RegionInfo regionInfo) {
-    Cell cell = r.getColumnLatestCell(HConstants.CATALOG_FAMILY, getStateColumn(replicaId));
+  public static State getRegionState(final Result r, RegionInfo regionInfo) {
+    Cell cell = r.getColumnLatestCell(HConstants.CATALOG_FAMILY,
+        getStateColumn(regionInfo.getReplicaId()));
     if (cell == null || cell.getValueLength() == 0) {
       return null;
     }
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java
index 874dcde19..eec820c 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java
@@ -20,21 +20,31 @@ package org.apache.hadoop.hbase.master.procedure;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.stream.Collectors;
 
+import org.apache.hadoop.hbase.HRegionLocation;
 import org.apache.hadoop.hbase.MetaTableAccessor;
+import org.apache.hadoop.hbase.RegionLocations;
 import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.client.Connection;
 import org.apache.hadoop.hbase.client.RegionInfo;
-import org.apache.hadoop.hbase.util.Pair;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.master.RegionState;
+import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
- * A SCP that differs from default only in how it gets the list of
- * Regions hosted on the crashed-server; it also reads hbase:meta directly rather
- * than rely solely on Master memory for list of Regions that were on crashed server.
- * This version of SCP is for external invocation as part of fix-up (e.g. HBCK2's
- * scheduleRecoveries). It is for the case where meta has references to 'Unknown Servers',
+ * Acts like the super class in all cases except when no Regions found in the
+ * current Master in-memory context. In this latter case, when the call to
+ * super#getRegionsOnCrashedServer returns nothing, this SCP will scan
+ * hbase:meta for references to the passed ServerName. If any found, we'll
+ * clean them up.
+ *
+ * <p>This version of SCP is for external invocation as part of fix-up (e.g. HBCK2's
+ * scheduleRecoveries); the super class is used during normal recovery operations.
+ * It is for the case where meta has references to 'Unknown Servers',
  * servers that are in hbase:meta but not in live-server or dead-server lists; i.e. Master
  * and hbase:meta content have deviated. It should never happen in normal running
  * cluster but if we do drop accounting of servers, we need a means of fix-up.
@@ -65,31 +75,97 @@ public class HBCKServerCrashProcedure extends ServerCrashProcedure {
   public HBCKServerCrashProcedure() {}
 
   /**
-   * Adds Regions found by super method any found scanning hbase:meta.
+   * If no Regions found in Master context, then we will search hbase:meta for references
+   * to the passed server. Operator may have passed ServerName because they have found
+   * references to 'Unknown Servers'. They are using HBCKSCP to clear them out.
    */
   @Override
   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NP_NULL_ON_SOME_PATH_EXCEPTION",
     justification="FindBugs seems confused on ps in below.")
   List<RegionInfo> getRegionsOnCrashedServer(MasterProcedureEnv env) {
-    // Super can return immutable emptyList.
+    // Super will return an immutable list (empty if nothing on this server).
     List<RegionInfo> ris = super.getRegionsOnCrashedServer(env);
-    List<Pair<RegionInfo, ServerName>> ps = null;
+    if (!ris.isEmpty()) {
+      return ris;
+    }
+    // Nothing in in-master context. Check for Unknown Server! in hbase:meta.
+    // If super list is empty, then allow that an operator scheduled an SCP because they are trying
+    // to purge 'Unknown Servers' -- servers that are neither online nor in dead servers
+    // list but that ARE in hbase:meta and so showing as unknown in places like 'HBCK Report'.
+    // This mis-accounting does not happen in normal circumstance but may arise in-extremis
+    // when cluster has been damaged in operation.
+    UnknownServerVisitor visitor =
+        new UnknownServerVisitor(env.getMasterServices().getConnection(), getServerName());
     try {
-      ps = MetaTableAccessor.getTableRegionsAndLocations(env.getMasterServices().getConnection(),
-              null, false);
+      MetaTableAccessor.scanMetaForTableRegions(env.getMasterServices().getConnection(),
+          visitor, null);
     } catch (IOException ioe) {
-      LOG.warn("Failed get of all regions; continuing", ioe);
-    }
-    if (ps == null || ps.isEmpty()) {
-      LOG.warn("No regions found in hbase:meta");
+      LOG.warn("Failed scan of hbase:meta for 'Unknown Servers'", ioe);
       return ris;
     }
-    List<RegionInfo> aggregate = ris == null || ris.isEmpty()?
-        new ArrayList<>(): new ArrayList<>(ris);
-    int before = aggregate.size();
-    ps.stream().filter(p -> p.getSecond() != null && p.getSecond().equals(getServerName())).
-        forEach(p -> aggregate.add(p.getFirst()));
-    LOG.info("Found {} mentions of {} in hbase:meta", aggregate.size() - before, getServerName());
-    return aggregate;
+    LOG.info("Found {} mentions of {} in hbase:meta of OPEN/OPENING Regions: {}",
+        visitor.getReassigns().size(), getServerName(),
+        visitor.getReassigns().stream().map(RegionInfo::getEncodedName).
+            collect(Collectors.joining(",")));
+    return visitor.getReassigns();
+  }
+
+  /**
+   * Visitor for hbase:meta that 'fixes' Unknown Server issues. Collects
+   * a List of Regions to reassign as 'result'.
+   */
+  private static class UnknownServerVisitor implements MetaTableAccessor.Visitor {
+    private final List<RegionInfo> reassigns = new ArrayList<>();
+    private final ServerName unknownServerName;
+    private final Connection connection;
+
+    private UnknownServerVisitor(Connection connection, ServerName unknownServerName) {
+      this.connection = connection;
+      this.unknownServerName = unknownServerName;
+    }
+
+    @Override
+    public boolean visit(Result result) throws IOException {
+      RegionLocations rls = MetaTableAccessor.getRegionLocations(result);
+      if (rls == null) {
+        return true;
+      }
+      for (HRegionLocation hrl: rls.getRegionLocations()) {
+        if (hrl == null) {
+          continue;
+        }
+        if (hrl.getRegion() == null) {
+          continue;
+        }
+        if (hrl.getServerName() == null) {
+          continue;
+        }
+        if (!hrl.getServerName().equals(this.unknownServerName)) {
+          continue;
+        }
+        RegionState.State state = RegionStateStore.getRegionState(result, hrl.getRegion());
+        RegionState rs = new RegionState(hrl.getRegion(), state, hrl.getServerName());
+        if (rs.isClosing()) {
+          // Move region to CLOSED in hbase:meta.
+          LOG.info("Moving {} from CLOSING to CLOSED in hbase:meta",
+              hrl.getRegion().getRegionNameAsString());
+          try {
+            MetaTableAccessor.updateRegionState(this.connection, hrl.getRegion(),
+                RegionState.State.CLOSED);
+          } catch (IOException ioe) {
+            LOG.warn("Failed moving {} from CLOSING to CLOSED", ioe);
+          }
+        } else if (rs.isOpening() || rs.isOpened()) {
+          this.reassigns.add(hrl.getRegion());
+        } else {
+          LOG.info("Passing {}", rs);
+        }
+      }
+      return true;
+    }
+
+    private List<RegionInfo> getReassigns() {
+      return this.reassigns;
+    }
   }
 }
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
index 08b89b0..804d264 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
@@ -3651,8 +3651,7 @@ public class HBaseTestingUtility extends HBaseZKTestingUtility {
                       return false;
                     }
                   }
-                  if (RegionStateStore.getRegionState(r,
-                    info.getReplicaId(), info) != RegionState.State.OPEN) {
+                  if (RegionStateStore.getRegionState(r, info) != RegionState.State.OPEN) {
                     return false;
                   }
                 }
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java
index 9129cc5..a74106c 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java
@@ -112,6 +112,7 @@ public class TestHBCKSCP extends TestSCPBase {
     master.getServerManager().moveFromOnlineToDeadServers(rsServerName);
     master.getServerManager().getDeadServers().finish(rsServerName);
     master.getServerManager().getDeadServers().removeDeadServer(rsServerName);
+    master.getAssignmentManager().getRegionStates().removeServer(rsServerName);
     // Kill the server. Nothing should happen since an 'Unknown Server' as far
     // as the Master is concerned; i.e. no SCP.
     LOG.info("Killing {}", rsServerName);