You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ap...@apache.org on 2015/08/13 19:57:20 UTC
hbase git commit: HBASE-13730 Backport HBASE-13576 (Failure in
checking one region should not fail the entire HBCK operation)
Repository: hbase
Updated Branches:
refs/heads/0.98 8b9ed1c23 -> f11472be9
HBASE-13730 Backport HBASE-13576 (Failure in checking one region should not fail the entire HBCK operation)
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/f11472be
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/f11472be
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/f11472be
Branch: refs/heads/0.98
Commit: f11472be9645abb4b725a081974abbaf78b83d99
Parents: 8b9ed1c
Author: Andrew Purtell <ap...@apache.org>
Authored: Wed Aug 12 17:46:59 2015 -0700
Committer: Andrew Purtell <ap...@apache.org>
Committed: Thu Aug 13 10:51:09 2015 -0700
----------------------------------------------------------------------
.../org/apache/hadoop/hbase/util/HBaseFsck.java | 67 +++++++++++++++++---
.../hadoop/hbase/util/HBaseFsckRepair.java | 4 +-
.../apache/hadoop/hbase/util/TestHBaseFsck.java | 1 +
3 files changed, 61 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hbase/blob/f11472be/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java
index eef9dac..4f795ce 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java
@@ -95,7 +95,6 @@ import org.apache.hadoop.hbase.master.MasterFileSystem;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService.BlockingInterface;
-import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
import org.apache.hadoop.hbase.regionserver.StoreFileInfo;
@@ -121,7 +120,6 @@ import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.zookeeper.KeeperException;
-import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
@@ -199,7 +197,7 @@ public class HBaseFsck extends Configured {
private HTable meta;
// threads to do ||izable tasks: retrieve data from regionservers, handle overlapping regions
protected ExecutorService executor;
- private long startMillis = System.currentTimeMillis();
+ private long startMillis = EnvironmentEdgeManager.currentTimeMillis();
private HFileCorruptionChecker hfcc;
private int retcode = 0;
private Path HBCK_LOCK_PATH;
@@ -281,6 +279,8 @@ public class HBaseFsck extends Configured {
private Map<TableName, Set<String>> orphanTableDirs =
new HashMap<TableName, Set<String>>();
+ private Map<TableName, Set<String>> skippedRegions = new HashMap<TableName, Set<String>>();
+
/**
* List of orphaned table ZNodes
*/
@@ -468,6 +468,7 @@ public class HBaseFsck extends Configured {
errors.clear();
tablesInfo.clear();
orphanHdfsDirs.clear();
+ skippedRegions.clear();
}
/**
@@ -1691,8 +1692,19 @@ public class HBaseFsck extends Configured {
workItems.add(new CheckRegionConsistencyWorkItem(e.getKey(), e.getValue()));
}
checkRegionConsistencyConcurrently(workItems);
+
+ // If some regions are skipped during the checkRegionConsistencyConcurrently() phase, we might
+ // not get an accurate state of hbase if we continue. The config here allows users to tune
+ // the tolerance for the number of skipped regions.
+ // TODO: evaluate the consequence to continue the hbck operation without config.
+ int terminateThreshold = getConf().getInt("hbase.hbck.skipped.regions.limit", 0);
+ int numOfSkippedRegions = skippedRegions.size();
+ if (numOfSkippedRegions > 0 && numOfSkippedRegions > terminateThreshold) {
+ throw new IOException(numOfSkippedRegions
+ + " region(s) could not be checked or repaired. See logs for detail.");
+ }
}
-
+
/**
* Check consistency of all regions using multiple threads concurrently.
*/
@@ -1733,11 +1745,32 @@ public class HBaseFsck extends Configured {
@Override
public synchronized Void call() throws Exception {
- checkRegionConsistency(key, hbi);
+ try {
+ checkRegionConsistency(key, hbi);
+ } catch (Exception e) {
+ // If the region is non-META region, skip this region and send warning/error message; if
+ // the region is META region, we should not continue.
+ LOG.warn("Unable to complete check or repair the region '" + hbi.getRegionNameAsString()
+ + "'.", e);
+ if (hbi.getHdfsHRI().isMetaRegion()) {
+ throw e;
+ }
+ LOG.warn("Skip region '" + hbi.getRegionNameAsString() + "'");
+ addSkippedRegion(hbi);
+ }
return null;
}
}
+ private void addSkippedRegion(final HbckInfo hbi) {
+ Set<String> skippedRegionNames = skippedRegions.get(hbi.getTableName());
+ if (skippedRegionNames == null) {
+ skippedRegionNames = new HashSet<String>();
+ }
+ skippedRegionNames.add(hbi.getRegionNameAsString());
+ skippedRegions.put(hbi.getTableName(), skippedRegionNames);
+ }
+
private void preCheckPermission() throws IOException, AccessDeniedException {
if (shouldIgnorePreCheckPermission()) {
return;
@@ -1931,7 +1964,7 @@ public class HBaseFsck extends Configured {
(hbi.metaEntry == null)? false: hbi.metaEntry.isSplit() && hbi.metaEntry.isOffline();
boolean shouldBeDeployed = inMeta && !isTableDisabled(hbi.metaEntry);
boolean recentlyModified = inHdfs &&
- hbi.getModTime() + timelag > System.currentTimeMillis();
+ hbi.getModTime() + timelag > EnvironmentEdgeManager.currentTimeMillis();
// ========== First the healthy cases =============
if (hbi.containsOnlyHdfsEdits()) {
@@ -2922,7 +2955,7 @@ public class HBaseFsck extends Configured {
*/
HTableDescriptor[] getTables(AtomicInteger numSkipped) {
List<TableName> tableNames = new ArrayList<TableName>();
- long now = System.currentTimeMillis();
+ long now = EnvironmentEdgeManager.currentTimeMillis();
for (HbckInfo hbi : regionInfoMap.values()) {
MetaEntry info = hbi.metaEntry;
@@ -3435,14 +3468,30 @@ public class HBaseFsck extends Configured {
*/
private void printTableSummary(SortedMap<TableName, TableInfo> tablesInfo) {
StringBuilder sb = new StringBuilder();
+ int numOfSkippedRegions;
errors.print("Summary:");
for (TableInfo tInfo : tablesInfo.values()) {
+ numOfSkippedRegions = (skippedRegions.containsKey(tInfo.getName())) ?
+ skippedRegions.get(tInfo.getName()).size() : 0;
+
if (errors.tableHasErrors(tInfo)) {
errors.print("Table " + tInfo.getName() + " is inconsistent.");
- } else {
- errors.print(" " + tInfo.getName() + " is okay.");
+ } else if (numOfSkippedRegions > 0){
+ errors.print("Table " + tInfo.getName() + " is okay (with "
+ + numOfSkippedRegions + " skipped regions).");
+ }
+ else {
+ errors.print("Table " + tInfo.getName() + " is okay.");
}
errors.print(" Number of regions: " + tInfo.getNumRegions());
+ if (numOfSkippedRegions > 0) {
+ Set<String> skippedRegionStrings = skippedRegions.get(tInfo.getName());
+ System.out.println(" Number of skipped regions: " + numOfSkippedRegions);
+ System.out.println(" List of skipped regions:");
+ for(String sr : skippedRegionStrings) {
+ System.out.println(" " + sr);
+ }
+ }
sb.setLength(0); // clear out existing buffer, if any.
sb.append(" Deployed on: ");
for (ServerName server : tInfo.deployedOn) {
http://git-wip-us.apache.org/repos/asf/hbase/blob/f11472be/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java
index 25b9673..e12894d 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java
@@ -117,8 +117,8 @@ public class HBaseFsckRepair {
public static void waitUntilAssigned(HBaseAdmin admin,
HRegionInfo region) throws IOException, InterruptedException {
long timeout = admin.getConfiguration().getLong("hbase.hbck.assign.timeout", 120000);
- long expiration = timeout + System.currentTimeMillis();
- while (System.currentTimeMillis() < expiration) {
+ long expiration = timeout + EnvironmentEdgeManager.currentTimeMillis();
+ while (EnvironmentEdgeManager.currentTimeMillis() < expiration) {
try {
Map<String, RegionState> rits=
admin.getClusterStatus().getRegionsInTransition();
http://git-wip-us.apache.org/repos/asf/hbase/blob/f11472be/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
index 1fab830..c9d49e8 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
@@ -94,6 +94,7 @@ import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.regionserver.SplitTransaction;
import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction;
+import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;
import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
import org.apache.hadoop.hbase.util.HBaseFsck.HbckInfo;