You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ns...@apache.org on 2011/10/11 04:01:41 UTC

svn commit: r1181358 - in /hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase: client/ ipc/ master/ regionserver/

Author: nspiegelberg
Date: Tue Oct 11 02:01:41 2011
New Revision: 1181358

URL: http://svn.apache.org/viewvc?rev=1181358&view=rev
Log:
HBase FSCK with -fix option

Summary:
Added -fix option to HBase FSCK to try to correct some of the errors it finds.

Test Plan:
Tested on the local dev server

DiffCamp Revision: 148220
Reviewed By: kranganathan
CC: davidrecordon, achao, kranganathan
Revert Plan:
OK

Added:
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsckRepair.java
Modified:
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsck.java
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsck.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsck.java?rev=1181358&r1=1181357&r2=1181358&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsck.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsck.java Tue Oct 11 02:01:41 2011
@@ -69,7 +69,8 @@ public class HBaseFsck {
 
   private boolean details = false; // do we display the full report?
   private long timelag = DEFAULT_TIME_LAG; // tables whose modtime is older
-
+  private boolean fix = false; // do we want to try fixing the errors?
+  private boolean rerun = false; // if we tried to fix something rerun hbck
 
   /**
    * Constructor
@@ -255,7 +256,7 @@ public class HBaseFsck {
   /**
    * Check consistency of all regions that have been found in previous phases.
    */
-  void checkConsistency() {
+  void checkConsistency() throws IOException {
     for (HbckInfo hbi : regionInfo.values()) {
       doConsistencyCheck(hbi);
     }
@@ -264,19 +265,20 @@ public class HBaseFsck {
   /**
    * Check a single region for consistency and correct deployment.
    */
-  void doConsistencyCheck(HbckInfo hbi) {
+  void doConsistencyCheck(HbckInfo hbi) throws IOException {
     String descriptiveName = hbi.toString();
 
     boolean inMeta = hbi.metaEntry != null;
     boolean inHdfs = hbi.foundRegionDir != null;
+    boolean hasMetaAssignment = inMeta && hbi.metaEntry.regionServer != null;
     boolean isDeployed = !hbi.deployedOn.isEmpty();
     boolean isMultiplyDeployed = hbi.deployedOn.size() > 1;
     boolean deploymentMatchesMeta =
-      inMeta && isDeployed && !isMultiplyDeployed &&
+      hasMetaAssignment && isDeployed && !isMultiplyDeployed &&
       hbi.metaEntry.regionServer.equals(hbi.deployedOn.get(0));
     boolean shouldBeDeployed = inMeta && !hbi.metaEntry.isOffline();
     boolean recentlyModified = hbi.foundRegionDir != null &&
-      hbi.foundRegionDir.getModificationTime() + timelag < System.currentTimeMillis();
+      hbi.foundRegionDir.getModificationTime() + timelag > System.currentTimeMillis();
 
     // ========== First the healthy cases =============
     if (inMeta && inHdfs && isDeployed && deploymentMatchesMeta && shouldBeDeployed) {
@@ -313,6 +315,12 @@ public class HBaseFsck {
         "and deployed on " + Joiner.on(", ").join(hbi.deployedOn));
     } else if (inMeta && inHdfs && !isDeployed) {
       errors.reportError("Region " + descriptiveName + " not deployed on any region server.");
+      // If we are trying to fix the errors
+      if (shouldFix()) {
+        System.out.println("Trying to fix unassigned region...");
+        setShouldRerun();
+        HBaseFsckRepair.fixUnassigned(this.conf, hbi.metaEntry);
+      }
     } else if (inMeta && inHdfs && isDeployed && !shouldBeDeployed) {
       errors.reportError("Region " + descriptiveName + " has should not be deployed according " +
         "to META, but is deployed on " + Joiner.on(", ").join(hbi.deployedOn));
@@ -320,10 +328,22 @@ public class HBaseFsck {
       errors.reportError("Region " + descriptiveName + " is listed in META on region server " +
         hbi.metaEntry.regionServer + " but is multiply assigned to region servers " +
         Joiner.on(", ").join(hbi.deployedOn));
+      // If we are trying to fix the errors
+      if (shouldFix()) {
+        System.out.println("Trying to fix assignment error...");
+        setShouldRerun();
+        HBaseFsckRepair.fixDupeAssignment(this.conf, hbi.metaEntry, hbi.deployedOn);
+      }
     } else if (inMeta && inHdfs && isDeployed && !deploymentMatchesMeta) {
       errors.reportError("Region " + descriptiveName + " listed in META on region server " +
         hbi.metaEntry.regionServer + " but found on region server " +
         hbi.deployedOn.get(0));
+      // If we are trying to fix the errors
+      if (shouldFix()) {
+        System.out.println("Trying to fix assignment error...");
+        setShouldRerun();
+        HBaseFsckRepair.fixDupeAssignment(this.conf, hbi.metaEntry, hbi.deployedOn);
+      }
     } else {
       errors.reportError("Region " + descriptiveName + " is in an unforeseen state:" +
         " inMeta=" + inMeta +
@@ -513,7 +533,7 @@ public class HBaseFsck {
         System.out.println("\nRest easy, buddy! HBase is clean. ");
         return 0;
       } else {
-        System.out.println("\nInconsistencies detected.");
+        System.out.println("\n" + Integer.toString(errorCount) + " inconsistencies detected.");
         return -1;
       }
     }
@@ -540,6 +560,32 @@ public class HBaseFsck {
   }
 
   /**
+   * Record that fsck should be run again. This is called once we have tried
+   * to fix something, so that the fsck tool is rerun afterwards to check the
+   * result. The recorded flag is read back through shouldRerun(), which the
+   * caller consults after the first doWork() pass.
+   */
+  void setShouldRerun() {
+    rerun = true;
+  }
+
+  boolean shouldRerun() {
+    return rerun;
+  }
+
+  /**
+   * Enable fixing of inconsistencies found by fsck. Once this is set, fsck
+   * tries to fix the errors (if any) that it finds; see shouldFix().
+   */
+  void setFixErrors() {
+    fix = true;
+  }
+
+  boolean shouldFix() {
+    return fix;
+  }
+
+  /**
    * We are interested in only those tables that have not changed their state in
    * META during the last few seconds specified by hbase.admin.fsck.timelag
    * @param seconds - the time in seconds
@@ -588,6 +634,8 @@ public class HBaseFsck {
           printUsageAndExit();
         }
         i++;
+      } else if (cmd.equals("-fix")) {
+        fsck.setFixErrors();
       } else {
         String str = "Unknown command line option : " + cmd;
         LOG.info(str);
@@ -597,6 +645,14 @@ public class HBaseFsck {
     }
     // do the real work of fsck
     int code = fsck.doWork();
+    // If we have changed the HBase state it is better to run fsck again
+    // to see if we haven't broken something else in the process.
+    // We run it only once more because otherwise we can easily fall into
+    // an infinite loop.
+    if (fsck.shouldRerun()) {
+      code = fsck.doWork();
+    }
+
     Runtime.getRuntime().exit(code);
   }
 }

Added: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsckRepair.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsckRepair.java?rev=1181358&view=auto
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsckRepair.java (added)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsckRepair.java Tue Oct 11 02:01:41 2011
@@ -0,0 +1,104 @@
+/**
+ * Copyright 2010 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.client;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.HRegionInfo;
+import org.apache.hadoop.hbase.HServerAddress;
+import org.apache.hadoop.hbase.ipc.HMasterInterface;
+import org.apache.hadoop.hbase.ipc.HRegionInterface;
+import org.apache.hadoop.hbase.zookeeper.ZooKeeperWrapper;
+import org.apache.zookeeper.KeeperException;
+
+public class HBaseFsckRepair {
+
+  public static void fixDupeAssignment(Configuration conf, HRegionInfo region,
+      List<HServerAddress> servers)
+  throws IOException {
+
+    HRegionInfo actualRegion = new HRegionInfo(region);
+
+    // Clear status in master and zk
+    clearInMaster(conf, actualRegion);
+    clearInZK(conf, actualRegion);
+
+    // Close region on the servers
+    for(HServerAddress server : servers) {
+      closeRegion(conf, server, actualRegion);
+    }
+
+    // It's unassigned so fix it as such
+    fixUnassigned(conf, actualRegion);
+  }
+
+  public static void fixUnassigned(Configuration conf, HRegionInfo region)
+  throws IOException {
+
+    HRegionInfo actualRegion = new HRegionInfo(region);
+
+    // Clear status in master and zk
+    clearInMaster(conf, actualRegion);
+    clearInZK(conf, actualRegion);
+
+    // Clear assignment in META
+    clearMetaAssignment(conf, actualRegion);
+  }
+
+  private static void clearInMaster(Configuration conf, HRegionInfo region)
+  throws IOException {
+    System.out.println("Region being cleared in master: " + region);
+    HMasterInterface master = HConnectionManager.getConnection(conf).getMaster();
+    long masterVersion =
+      master.getProtocolVersion("org.apache.hadoop.hbase.ipc.HMasterInterface", 25);
+    System.out.println("Master protocol version: " + masterVersion);
+    master.clearFromTransition(region);
+  }
+
+  private static void clearInZK(Configuration conf, HRegionInfo region)
+  throws IOException {
+    ZooKeeperWrapper zkw = HConnectionManager.getConnection(conf).getZooKeeperWrapper();
+//    try {
+      zkw.deleteUnassignedRegion(region.getEncodedName());
+//    } catch(KeeperException ke) {}
+  }
+
+  private static void closeRegion(Configuration conf, HServerAddress server,
+      HRegionInfo region)
+  throws IOException {
+    HRegionInterface rs =
+      HConnectionManager.getConnection(conf).getHRegionConnection(server);
+    rs.closeRegion(region, false);
+  }
+
+  private static void clearMetaAssignment(Configuration conf,
+      HRegionInfo region)
+  throws IOException {
+    HTable ht = new HTable(conf, HConstants.META_TABLE_NAME);
+    Delete del = new Delete(region.getRegionName());
+    del.deleteColumns(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
+    del.deleteColumns(HConstants.CATALOG_FAMILY,
+        HConstants.STARTCODE_QUALIFIER);
+    ht.delete(del);
+  }
+}

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java?rev=1181358&r1=1181357&r2=1181358&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HBaseRPCProtocolVersion.java Tue Oct 11 02:01:41 2011
@@ -76,7 +76,8 @@ public interface HBaseRPCProtocolVersion
    * <li>Version 22: HBASE-2209. Added List support to RPC</li>
    * <li>Version 23: HBASE-2066, multi-put.</li>
    * <li>Version 24: HBASE-2473, create table with regions.</li>
+   * <li>Version 25: HBCK changes to master and RS</li>
    * </ul>
    */
-  public static final long versionID = 24L;
+  public static final long versionID = 25L;
 }

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java?rev=1181358&r1=1181357&r2=1181358&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HMasterInterface.java Tue Oct 11 02:01:41 2011
@@ -22,6 +22,7 @@ package org.apache.hadoop.hbase.ipc;
 import org.apache.hadoop.hbase.ClusterStatus;
 import org.apache.hadoop.hbase.HColumnDescriptor;
 import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.HTableDescriptor;
 import org.apache.hadoop.io.Writable;
 
@@ -128,4 +129,10 @@ public interface HMasterInterface extend
    * @return status object
    */
   public ClusterStatus getClusterStatus();
+
+  /**
+   * Clears the specified region from being in transition.  Used by HBaseFsck.
+   * @param region region to clear from transition map
+   */
+  public void clearFromTransition(HRegionInfo region);
 }

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java?rev=1181358&r1=1181357&r2=1181358&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java Tue Oct 11 02:01:41 2011
@@ -298,4 +298,12 @@ public interface HRegionInterface extend
    */
   public void replicateLogEntries(HLog.Entry[] entries) throws IOException;
 
+  /**
+   * Closes the specified region.
+   * @param hri region to be closed
+   * @param reportWhenCompleted whether to report to master
+   * @throws IOException
+   */
+  public void closeRegion(final HRegionInfo hri, final boolean reportWhenCompleted)
+  throws IOException;
 }

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1181358&r1=1181357&r2=1181358&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Tue Oct 11 02:01:41 2011
@@ -271,7 +271,7 @@ public class HMaster extends Thread impl
     this.metrics = new MasterMetrics(MASTER);
     // We're almost open for business
     this.closed.set(false);
-    LOG.info("HMaster initialized on " + this.address.toString());
+    LOG.info("HMaster w/ hbck initialized on " + this.address.toString());
   }
 
   /**
@@ -1401,4 +1401,10 @@ public class HMaster extends Thread impl
   public static void main(String [] args) {
     doMain(args, HMaster.class);
   }
+
+  @Override
+  public void clearFromTransition(HRegionInfo region) {
+    this.regionManager.clearFromInTransition(region.getRegionName());
+    LOG.info("Cleared region " + region + " from transition map");
+  }
 }

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=1181358&r1=1181357&r2=1181358&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Tue Oct 11 02:01:41 2011
@@ -1510,7 +1510,7 @@ public class HRegionServer implements HR
     getOutboundMsgs().add(new HMsg(HMsg.Type.MSG_REPORT_PROCESS_OPEN, hri));
   }
 
-  protected void closeRegion(final HRegionInfo hri, final boolean reportWhenCompleted)
+  public void closeRegion(final HRegionInfo hri, final boolean reportWhenCompleted)
   throws IOException {
     RSZookeeperUpdater zkUpdater = null;
     if(reportWhenCompleted) {