You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2020/01/02 22:32:57 UTC
[hbase] branch branch-2.2 updated: HBASE-23596
HBCKServerCrashProcedure can double assign
This is an automated email from the ASF dual-hosted git repository.
stack pushed a commit to branch branch-2.2
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2.2 by this push:
new 189a3b1 HBASE-23596 HBCKServerCrashProcedure can double assign
189a3b1 is described below
commit 189a3b1cf6ce832ce7aa832fffa25c7179f5683e
Author: stack <st...@apache.org>
AuthorDate: Wed Dec 18 23:08:24 2019 -0800
HBASE-23596 HBCKServerCrashProcedure can double assign
Signed-off-by: Duo Zhang <zh...@apache.org>
Signed-off-by: Lijin Bin <bi...@apache.org>
Signed-off-by: Viraj Jasani <vj...@apache.org>
Change its behavior so it will only look in hbase:meta
if the call to the super class turns up zero references.
Only then will it search hbase:meta for references to
'Unknown Servers'. Normal operation where we read Master
context is usual and sufficient. The scan of hbase:meta
is only for case where Master state has been corrupted
and we need to clear out 'Unknown Servers'.
---
.../org/apache/hadoop/hbase/MetaTableAccessor.java | 13 ++-
.../hbase/master/assignment/RegionStateStore.java | 10 +-
.../master/procedure/HBCKServerCrashProcedure.java | 120 +++++++++++++++++----
.../apache/hadoop/hbase/HBaseTestingUtility.java | 3 +-
.../hadoop/hbase/master/procedure/TestHBCKSCP.java | 1 +
5 files changed, 116 insertions(+), 31 deletions(-)
diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
index d9dc779..0e41146 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
@@ -1448,7 +1448,7 @@ public class MetaTableAccessor {
}
}
- private static void addRegionStateToPut(Put put, RegionState.State state) throws IOException {
+ private static Put addRegionStateToPut(Put put, RegionState.State state) throws IOException {
put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY)
.setRow(put.getRow())
.setFamily(HConstants.CATALOG_FAMILY)
@@ -1457,6 +1457,17 @@ public class MetaTableAccessor {
.setType(Cell.Type.Put)
.setValue(Bytes.toBytes(state.name()))
.build());
+ return put;
+ }
+
+ /**
+ * Update state column in hbase:meta.
+ */
+ public static void updateRegionState(Connection connection, RegionInfo ri,
+ RegionState.State state) throws IOException {
+ Put put = new Put(RegionReplicaUtil.getRegionInfoForDefaultReplica(ri).getRegionName());
+ MetaTableAccessor.putsToMetaTable(connection,
+ Collections.singletonList(addRegionStateToPut(put, state)));
}
/**
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java
index df7f1bb..5ee22c3 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java
@@ -51,7 +51,6 @@ import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
/**
@@ -115,7 +114,7 @@ public class RegionStateStore {
if (regionInfo == null) continue;
final int replicaId = regionInfo.getReplicaId();
- final State state = getRegionState(result, replicaId, regionInfo);
+ final State state = getRegionState(result, regionInfo);
final ServerName lastHost = hrl.getServerName();
final ServerName regionLocation = getRegionServer(result, replicaId);
@@ -326,12 +325,11 @@ public class RegionStateStore {
/**
* Pull the region state from a catalog table {@link Result}.
- * @param r Result to pull the region state from
* @return the region state, or null if unknown.
*/
- @VisibleForTesting
- public static State getRegionState(final Result r, int replicaId, RegionInfo regionInfo) {
- Cell cell = r.getColumnLatestCell(HConstants.CATALOG_FAMILY, getStateColumn(replicaId));
+ public static State getRegionState(final Result r, RegionInfo regionInfo) {
+ Cell cell = r.getColumnLatestCell(HConstants.CATALOG_FAMILY,
+ getStateColumn(regionInfo.getReplicaId()));
if (cell == null || cell.getValueLength() == 0) {
return null;
}
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java
index 874dcde19..eec820c 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java
@@ -20,21 +20,31 @@ package org.apache.hadoop.hbase.master.procedure;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import java.util.stream.Collectors;
+import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.MetaTableAccessor;
+import org.apache.hadoop.hbase.RegionLocations;
import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.RegionInfo;
-import org.apache.hadoop.hbase.util.Pair;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.master.RegionState;
+import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * A SCP that differs from default only in how it gets the list of
- * Regions hosted on the crashed-server; it also reads hbase:meta directly rather
- * than rely solely on Master memory for list of Regions that were on crashed server.
- * This version of SCP is for external invocation as part of fix-up (e.g. HBCK2's
- * scheduleRecoveries). It is for the case where meta has references to 'Unknown Servers',
+ * Acts like the super class in all cases except when no Regions found in the
+ * current Master in-memory context. In this latter case, when the call to
+ * super#getRegionsOnCrashedServer returns nothing, this SCP will scan
+ * hbase:meta for references to the passed ServerName. If any found, we'll
+ * clean them up.
+ *
+ * <p>This version of SCP is for external invocation as part of fix-up (e.g. HBCK2's
+ * scheduleRecoveries); the super class is used during normal recovery operations.
+ * It is for the case where meta has references to 'Unknown Servers',
* servers that are in hbase:meta but not in live-server or dead-server lists; i.e. Master
* and hbase:meta content have deviated. It should never happen in normal running
* cluster but if we do drop accounting of servers, we need a means of fix-up.
@@ -65,31 +75,97 @@ public class HBCKServerCrashProcedure extends ServerCrashProcedure {
public HBCKServerCrashProcedure() {}
/**
- * Adds Regions found by super method any found scanning hbase:meta.
+ * If no Regions found in Master context, then we will search hbase:meta for references
+ * to the passed server. Operator may have passed ServerName because they have found
+ * references to 'Unknown Servers'. They are using HBCKSCP to clear them out.
*/
@Override
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NP_NULL_ON_SOME_PATH_EXCEPTION",
justification="FindBugs seems confused on ps in below.")
List<RegionInfo> getRegionsOnCrashedServer(MasterProcedureEnv env) {
- // Super can return immutable emptyList.
+ // Super will return an immutable list (empty if nothing on this server).
List<RegionInfo> ris = super.getRegionsOnCrashedServer(env);
- List<Pair<RegionInfo, ServerName>> ps = null;
+ if (!ris.isEmpty()) {
+ return ris;
+ }
+ // Nothing in in-master context. Check for Unknown Server! in hbase:meta.
+ // If super list is empty, then allow that an operator scheduled an SCP because they are trying
+ // to purge 'Unknown Servers' -- servers that are neither online nor in dead servers
+ // list but that ARE in hbase:meta and so showing as unknown in places like 'HBCK Report'.
+ // This mis-accounting does not happen in normal circumstance but may arise in-extremis
+ // when cluster has been damaged in operation.
+ UnknownServerVisitor visitor =
+ new UnknownServerVisitor(env.getMasterServices().getConnection(), getServerName());
try {
- ps = MetaTableAccessor.getTableRegionsAndLocations(env.getMasterServices().getConnection(),
- null, false);
+ MetaTableAccessor.scanMetaForTableRegions(env.getMasterServices().getConnection(),
+ visitor, null);
} catch (IOException ioe) {
- LOG.warn("Failed get of all regions; continuing", ioe);
- }
- if (ps == null || ps.isEmpty()) {
- LOG.warn("No regions found in hbase:meta");
+ LOG.warn("Failed scan of hbase:meta for 'Unknown Servers'", ioe);
return ris;
}
- List<RegionInfo> aggregate = ris == null || ris.isEmpty()?
- new ArrayList<>(): new ArrayList<>(ris);
- int before = aggregate.size();
- ps.stream().filter(p -> p.getSecond() != null && p.getSecond().equals(getServerName())).
- forEach(p -> aggregate.add(p.getFirst()));
- LOG.info("Found {} mentions of {} in hbase:meta", aggregate.size() - before, getServerName());
- return aggregate;
+ LOG.info("Found {} mentions of {} in hbase:meta of OPEN/OPENING Regions: {}",
+ visitor.getReassigns().size(), getServerName(),
+ visitor.getReassigns().stream().map(RegionInfo::getEncodedName).
+ collect(Collectors.joining(",")));
+ return visitor.getReassigns();
+ }
+
+ /**
+ * Visitor for hbase:meta that 'fixes' Unknown Server issues. Collects
+ * a List of Regions to reassign as 'result'.
+ */
+ private static class UnknownServerVisitor implements MetaTableAccessor.Visitor {
+ private final List<RegionInfo> reassigns = new ArrayList<>();
+ private final ServerName unknownServerName;
+ private final Connection connection;
+
+ private UnknownServerVisitor(Connection connection, ServerName unknownServerName) {
+ this.connection = connection;
+ this.unknownServerName = unknownServerName;
+ }
+
+ @Override
+ public boolean visit(Result result) throws IOException {
+ RegionLocations rls = MetaTableAccessor.getRegionLocations(result);
+ if (rls == null) {
+ return true;
+ }
+ for (HRegionLocation hrl: rls.getRegionLocations()) {
+ if (hrl == null) {
+ continue;
+ }
+ if (hrl.getRegion() == null) {
+ continue;
+ }
+ if (hrl.getServerName() == null) {
+ continue;
+ }
+ if (!hrl.getServerName().equals(this.unknownServerName)) {
+ continue;
+ }
+ RegionState.State state = RegionStateStore.getRegionState(result, hrl.getRegion());
+ RegionState rs = new RegionState(hrl.getRegion(), state, hrl.getServerName());
+ if (rs.isClosing()) {
+ // Move region to CLOSED in hbase:meta.
+ LOG.info("Moving {} from CLOSING to CLOSED in hbase:meta",
+ hrl.getRegion().getRegionNameAsString());
+ try {
+ MetaTableAccessor.updateRegionState(this.connection, hrl.getRegion(),
+ RegionState.State.CLOSED);
+ } catch (IOException ioe) {
+ LOG.warn("Failed moving {} from CLOSING to CLOSED", ioe);
+ }
+ } else if (rs.isOpening() || rs.isOpened()) {
+ this.reassigns.add(hrl.getRegion());
+ } else {
+ LOG.info("Passing {}", rs);
+ }
+ }
+ return true;
+ }
+
+ private List<RegionInfo> getReassigns() {
+ return this.reassigns;
+ }
}
}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
index 08b89b0..804d264 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
@@ -3651,8 +3651,7 @@ public class HBaseTestingUtility extends HBaseZKTestingUtility {
return false;
}
}
- if (RegionStateStore.getRegionState(r,
- info.getReplicaId(), info) != RegionState.State.OPEN) {
+ if (RegionStateStore.getRegionState(r, info) != RegionState.State.OPEN) {
return false;
}
}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java
index 9129cc5..a74106c 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java
@@ -112,6 +112,7 @@ public class TestHBCKSCP extends TestSCPBase {
master.getServerManager().moveFromOnlineToDeadServers(rsServerName);
master.getServerManager().getDeadServers().finish(rsServerName);
master.getServerManager().getDeadServers().removeDeadServer(rsServerName);
+ master.getAssignmentManager().getRegionStates().removeServer(rsServerName);
// Kill the server. Nothing should happen since an 'Unknown Server' as far
// as the Master is concerned; i.e. no SCP.
LOG.info("Killing {}", rsServerName);