You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ns...@apache.org on 2011/10/11 04:01:41 UTC
svn commit: r1181358 - in
/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase: client/ ipc/
master/ regionserver/
Author: nspiegelberg
Date: Tue Oct 11 02:01:41 2011
New Revision: 1181358
URL: http://svn.apache.org/viewvc?rev=1181358&view=rev
Log:
HBase FSCK with -fix option
Summary:
Added -fix option to HBase FSCK to try to correct some of the errors it finds.
Test Plan:
Tested on the local dev server
DiffCamp Revision: 148220
Reviewed By: kranganathan
CC: davidrecordon, achao, kranganathan
Revert Plan:
OK
Added:
hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsckRepair.java
Modified:
hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsck.java
hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java
hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java
hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java
hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsck.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsck.java?rev=1181358&r1=1181357&r2=1181358&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsck.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsck.java Tue Oct 11 02:01:41 2011
@@ -69,7 +69,8 @@ public class HBaseFsck {
private boolean details = false; // do we display the full report?
private long timelag = DEFAULT_TIME_LAG; // tables whose modtime is older
-
+ private boolean fix = false; // do we want to try fixing the errors?
+ private boolean rerun = false; // if we tried to fix something rerun hbck
/**
* Constructor
@@ -255,7 +256,7 @@ public class HBaseFsck {
/**
* Check consistency of all regions that have been found in previous phases.
*/
- void checkConsistency() {
+ void checkConsistency() throws IOException {
for (HbckInfo hbi : regionInfo.values()) {
doConsistencyCheck(hbi);
}
@@ -264,19 +265,20 @@ public class HBaseFsck {
/**
* Check a single region for consistency and correct deployment.
*/
- void doConsistencyCheck(HbckInfo hbi) {
+ void doConsistencyCheck(HbckInfo hbi) throws IOException {
String descriptiveName = hbi.toString();
boolean inMeta = hbi.metaEntry != null;
boolean inHdfs = hbi.foundRegionDir != null;
+ boolean hasMetaAssignment = inMeta && hbi.metaEntry.regionServer != null;
boolean isDeployed = !hbi.deployedOn.isEmpty();
boolean isMultiplyDeployed = hbi.deployedOn.size() > 1;
boolean deploymentMatchesMeta =
- inMeta && isDeployed && !isMultiplyDeployed &&
+ hasMetaAssignment && isDeployed && !isMultiplyDeployed &&
hbi.metaEntry.regionServer.equals(hbi.deployedOn.get(0));
boolean shouldBeDeployed = inMeta && !hbi.metaEntry.isOffline();
boolean recentlyModified = hbi.foundRegionDir != null &&
- hbi.foundRegionDir.getModificationTime() + timelag < System.currentTimeMillis();
+ hbi.foundRegionDir.getModificationTime() + timelag > System.currentTimeMillis();
// ========== First the healthy cases =============
if (inMeta && inHdfs && isDeployed && deploymentMatchesMeta && shouldBeDeployed) {
@@ -313,6 +315,12 @@ public class HBaseFsck {
"and deployed on " + Joiner.on(", ").join(hbi.deployedOn));
} else if (inMeta && inHdfs && !isDeployed) {
errors.reportError("Region " + descriptiveName + " not deployed on any region server.");
+ // If we are trying to fix the errors
+ if (shouldFix()) {
+ System.out.println("Trying to fix unassigned region...");
+ setShouldRerun();
+ HBaseFsckRepair.fixUnassigned(this.conf, hbi.metaEntry);
+ }
} else if (inMeta && inHdfs && isDeployed && !shouldBeDeployed) {
errors.reportError("Region " + descriptiveName + " has should not be deployed according " +
"to META, but is deployed on " + Joiner.on(", ").join(hbi.deployedOn));
@@ -320,10 +328,22 @@ public class HBaseFsck {
errors.reportError("Region " + descriptiveName + " is listed in META on region server " +
hbi.metaEntry.regionServer + " but is multiply assigned to region servers " +
Joiner.on(", ").join(hbi.deployedOn));
+ // If we are trying to fix the errors
+ if (shouldFix()) {
+ System.out.println("Trying to fix assignment error...");
+ setShouldRerun();
+ HBaseFsckRepair.fixDupeAssignment(this.conf, hbi.metaEntry, hbi.deployedOn);
+ }
} else if (inMeta && inHdfs && isDeployed && !deploymentMatchesMeta) {
errors.reportError("Region " + descriptiveName + " listed in META on region server " +
hbi.metaEntry.regionServer + " but found on region server " +
hbi.deployedOn.get(0));
+ // If we are trying to fix the errors
+ if (shouldFix()) {
+ System.out.println("Trying to fix assignment error...");
+ setShouldRerun();
+ HBaseFsckRepair.fixDupeAssignment(this.conf, hbi.metaEntry, hbi.deployedOn);
+ }
} else {
errors.reportError("Region " + descriptiveName + " is in an unforeseen state:" +
" inMeta=" + inMeta +
@@ -513,7 +533,7 @@ public class HBaseFsck {
System.out.println("\nRest easy, buddy! HBase is clean. ");
return 0;
} else {
- System.out.println("\nInconsistencies detected.");
+ System.out.println("\n" + Integer.toString(errorCount) + " inconsistencies detected.");
return -1;
}
}
@@ -540,6 +560,32 @@ public class HBaseFsck {
}
/**
+ * Record that fsck should be run again. This is set whenever we have tried
+ * to fix something, so that the tool makes one more pass afterwards
+ * (see shouldRerun()) to check that the fix did not break something
+ * else in the process.
+ */
+ void setShouldRerun() {
+ rerun = true;
+ }
+
+ boolean shouldRerun() {
+ return rerun;
+ }
+
+ /**
+ * Enable fixing of inconsistencies found by fsck. Once set, fsck tries to
+ * fix the errors (if any) that it finds; see shouldFix().
+ */
+ void setFixErrors() {
+ fix = true;
+ }
+
+ boolean shouldFix() {
+ return fix;
+ }
+
+ /**
* We are interested in only those tables that have not changed their state in
* META during the last few seconds specified by hbase.admin.fsck.timelag
* @param seconds - the time in seconds
@@ -588,6 +634,8 @@ public class HBaseFsck {
printUsageAndExit();
}
i++;
+ } else if (cmd.equals("-fix")) {
+ fsck.setFixErrors();
} else {
String str = "Unknown command line option : " + cmd;
LOG.info(str);
@@ -597,6 +645,14 @@ public class HBaseFsck {
}
// do the real work of fsck
int code = fsck.doWork();
+ // If we have changed the HBase state it is better to run fsck again
+ // to see if we haven't broken something else in the process.
+ // We run it only once more because otherwise we can easily fall into
+ // an infinite loop.
+ if (fsck.shouldRerun()) {
+ code = fsck.doWork();
+ }
+
Runtime.getRuntime().exit(code);
}
}
Added: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsckRepair.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsckRepair.java?rev=1181358&view=auto
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsckRepair.java (added)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsckRepair.java Tue Oct 11 02:01:41 2011
@@ -0,0 +1,104 @@
+/**
+ * Copyright 2010 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.client;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.HRegionInfo;
+import org.apache.hadoop.hbase.HServerAddress;
+import org.apache.hadoop.hbase.ipc.HMasterInterface;
+import org.apache.hadoop.hbase.ipc.HRegionInterface;
+import org.apache.hadoop.hbase.zookeeper.ZooKeeperWrapper;
+import org.apache.zookeeper.KeeperException;
+
+public class HBaseFsckRepair {
+
+ public static void fixDupeAssignment(Configuration conf, HRegionInfo region,
+ List<HServerAddress> servers)
+ throws IOException {
+
+ HRegionInfo actualRegion = new HRegionInfo(region);
+
+ // Clear status in master and zk
+ clearInMaster(conf, actualRegion);
+ clearInZK(conf, actualRegion);
+
+ // Close region on the servers
+ for(HServerAddress server : servers) {
+ closeRegion(conf, server, actualRegion);
+ }
+
+ // It's unassigned so fix it as such
+ fixUnassigned(conf, actualRegion);
+ }
+
+ public static void fixUnassigned(Configuration conf, HRegionInfo region)
+ throws IOException {
+
+ HRegionInfo actualRegion = new HRegionInfo(region);
+
+ // Clear status in master and zk
+ clearInMaster(conf, actualRegion);
+ clearInZK(conf, actualRegion);
+
+ // Clear assignment in META
+ clearMetaAssignment(conf, actualRegion);
+ }
+
+ private static void clearInMaster(Configuration conf, HRegionInfo region)
+ throws IOException {
+ System.out.println("Region being cleared in master: " + region);
+ HMasterInterface master = HConnectionManager.getConnection(conf).getMaster();
+ long masterVersion =
+ master.getProtocolVersion("org.apache.hadoop.hbase.ipc.HMasterInterface", 25);
+ System.out.println("Master protocol version: " + masterVersion);
+ master.clearFromTransition(region);
+ }
+
+ private static void clearInZK(Configuration conf, HRegionInfo region)
+ throws IOException {
+ ZooKeeperWrapper zkw = HConnectionManager.getConnection(conf).getZooKeeperWrapper();
+// try {
+ zkw.deleteUnassignedRegion(region.getEncodedName());
+// } catch(KeeperException ke) {}
+ }
+
+ private static void closeRegion(Configuration conf, HServerAddress server,
+ HRegionInfo region)
+ throws IOException {
+ HRegionInterface rs =
+ HConnectionManager.getConnection(conf).getHRegionConnection(server);
+ rs.closeRegion(region, false);
+ }
+
+ private static void clearMetaAssignment(Configuration conf,
+ HRegionInfo region)
+ throws IOException {
+ HTable ht = new HTable(conf, HConstants.META_TABLE_NAME);
+ Delete del = new Delete(region.getRegionName());
+ del.deleteColumns(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
+ del.deleteColumns(HConstants.CATALOG_FAMILY,
+ HConstants.STARTCODE_QUALIFIER);
+ ht.delete(del);
+ }
+}
Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java?rev=1181358&r1=1181357&r2=1181358&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java Tue Oct 11 02:01:41 2011
@@ -76,7 +76,8 @@ public interface HBaseRPCProtocolVersion
* <li>Version 22: HBASE-2209. Added List support to RPC</li>
* <li>Version 23: HBASE-2066, multi-put.</li>
* <li>Version 24: HBASE-2473, create table with regions.</li>
+ * <li>Version 25: HBCK changes to master and RS.</li>
* </ul>
*/
- public static final long versionID = 24L;
+ public static final long versionID = 25L;
}
Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java?rev=1181358&r1=1181357&r2=1181358&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java Tue Oct 11 02:01:41 2011
@@ -22,6 +22,7 @@ package org.apache.hadoop.hbase.ipc;
import org.apache.hadoop.hbase.ClusterStatus;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.io.Writable;
@@ -128,4 +129,10 @@ public interface HMasterInterface extend
* @return status object
*/
public ClusterStatus getClusterStatus();
+
+ /**
+ * Clears the specified region from being in transition. Used by HBaseFsck.
+ * @param region region to clear from transition map
+ */
+ public void clearFromTransition(HRegionInfo region);
}
Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java?rev=1181358&r1=1181357&r2=1181358&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java Tue Oct 11 02:01:41 2011
@@ -298,4 +298,12 @@ public interface HRegionInterface extend
*/
public void replicateLogEntries(HLog.Entry[] entries) throws IOException;
+ /**
+ * Closes the specified region.
+ * @param hri region to be closed
+ * @param reportWhenCompleted whether to report to master
+ * @throws IOException
+ */
+ public void closeRegion(final HRegionInfo hri, final boolean reportWhenCompleted)
+ throws IOException;
}
Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1181358&r1=1181357&r2=1181358&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Tue Oct 11 02:01:41 2011
@@ -271,7 +271,7 @@ public class HMaster extends Thread impl
this.metrics = new MasterMetrics(MASTER);
// We're almost open for business
this.closed.set(false);
- LOG.info("HMaster initialized on " + this.address.toString());
+ LOG.info("HMaster w/ hbck initialized on " + this.address.toString());
}
/**
@@ -1401,4 +1401,10 @@ public class HMaster extends Thread impl
public static void main(String [] args) {
doMain(args, HMaster.class);
}
+
+ @Override
+ public void clearFromTransition(HRegionInfo region) {
+ this.regionManager.clearFromInTransition(region.getRegionName());
+ LOG.info("Cleared region " + region + " from transition map");
+ }
}
Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=1181358&r1=1181357&r2=1181358&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Tue Oct 11 02:01:41 2011
@@ -1510,7 +1510,7 @@ public class HRegionServer implements HR
getOutboundMsgs().add(new HMsg(HMsg.Type.MSG_REPORT_PROCESS_OPEN, hri));
}
- protected void closeRegion(final HRegionInfo hri, final boolean reportWhenCompleted)
+ public void closeRegion(final HRegionInfo hri, final boolean reportWhenCompleted)
throws IOException {
RSZookeeperUpdater zkUpdater = null;
if(reportWhenCompleted) {