Posted to commits@hbase.apache.org by st...@apache.org on 2011/05/04 21:28:45 UTC

svn commit: r1099566 - in /hbase/trunk: CHANGES.txt src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java

Author: stack
Date: Wed May  4 19:28:45 2011
New Revision: 1099566

URL: http://svn.apache.org/viewvc?rev=1099566&view=rev
Log:
HBASE-3695 Some improvements to Hbck to test the entire region chain in Meta and provide better error reporting
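For context, the gist of the new region-chain check is to sort a table's regions by start key and compare each region's boundaries with its predecessor's: the first start key must be empty, a repeated start key is a DUPE_STARTKEYS, a start key past the previous end key is a HOLE_IN_REGION_CHAIN, a start key before the previous end key is an OVERLAP_IN_REGION_CHAIN, and an end key that sorts before its own start key is a REGION_CYCLE. Below is a minimal, self-contained sketch of those invariants, not the committed code: it uses String keys for brevity where HBaseFsck compares byte[] with Bytes.compareTo, and the class and method names are illustrative only.

    import java.util.*;

    public class RegionChainSketch {
      static class Region {
        final String start, end;            // "" stands for the empty (open) key
        Region(String start, String end) { this.start = start; this.end = end; }
      }

      static List<String> check(List<Region> regions) {
        List<String> problems = new ArrayList<String>();
        // Sort by start key, then end key, as HbckInfo.compareTo does.
        Collections.sort(regions, new Comparator<Region>() {
          public int compare(Region a, Region b) {
            int c = a.start.compareTo(b.start);
            return c != 0 ? c : a.end.compareTo(b.end);
          }
        });
        Region last = null;
        for (Region r : regions) {
          if (last == null) {
            if (!r.start.isEmpty()) problems.add("FIRST_REGION_STARTKEY_NOT_EMPTY");
          } else if (r.start.equals(last.start)) {
            problems.add("DUPE_STARTKEYS");
          } else {
            int cmp = r.start.compareTo(last.end);
            if (cmp > 0) problems.add("HOLE_IN_REGION_CHAIN");         // gap in the chain
            else if (cmp < 0) problems.add("OVERLAP_IN_REGION_CHAIN"); // ranges overlap
          }
          if (!r.end.isEmpty() && r.start.compareTo(r.end) > 0) {
            problems.add("REGION_CYCLE");   // end key sorts before its own start key
          }
          last = r;
        }
        return problems;
      }

      public static void main(String[] args) {
        // Regions ""-A, A-B, C-"" leave a hole between B and C.
        List<Region> regions = new ArrayList<Region>(Arrays.asList(
            new Region("", "A"), new Region("A", "B"), new Region("C", "")));
        System.out.println(check(regions));  // prints [HOLE_IN_REGION_CHAIN]
      }
    }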

Modified:
    hbase/trunk/CHANGES.txt
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java
    hbase/trunk/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
    hbase/trunk/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java

Modified: hbase/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hbase/trunk/CHANGES.txt?rev=1099566&r1=1099565&r2=1099566&view=diff
==============================================================================
--- hbase/trunk/CHANGES.txt (original)
+++ hbase/trunk/CHANGES.txt Wed May  4 19:28:45 2011
@@ -285,6 +285,8 @@ Release 0.90.3 - Unreleased
    HBASE-3580  Remove RS from DeadServer when new instance checks in
    HBASE-2470  Add Scan.setTimeRange() support in Shell (Harsh J Chouraria)
    HBASE-3805  Log RegionState that are processed too late in the master
+   HBASE-3695  Some improvements to Hbck to test the entire region chain in
+               Meta and provide better error reporting (Marc Limotte)
 
   TASKS
    HBASE-3748  Add rolling of thrift/rest daemons to graceful_stop.sh script

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java?rev=1099566&r1=1099565&r2=1099566&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java Wed May  4 19:28:45 2011
@@ -20,14 +20,7 @@
 package org.apache.hadoop.hbase.util;
 
 import java.io.IOException;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-import java.util.TreeMap;
-import java.util.TreeSet;
+import java.util.*;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.ThreadPoolExecutor;
 import java.util.concurrent.TimeUnit;
@@ -69,6 +62,8 @@ import org.apache.zookeeper.KeeperExcept
 import com.google.common.base.Joiner;
 import com.google.common.collect.Lists;
 
+import static org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
+
 /**
  * Check consistency among the in-memory states of the master and the
  * region server(s) and the state of data in HDFS.
@@ -101,7 +96,7 @@ public class HBaseFsck {
   private Set<Result> emptyRegionInfoQualifiers = new HashSet<Result>();
   private int numThreads = MAX_NUM_THREADS;
 
-  ThreadPoolExecutor executor;         // threads to retrieve data from regionservers
+  ThreadPoolExecutor executor; // threads to retrieve data from regionservers
 
   /**
    * Constructor
@@ -140,20 +135,21 @@ public class HBaseFsck {
     tablesInfo.clear();
     emptyRegionInfoQualifiers.clear();
     disabledTables.clear();
+    errors.clear();
 
     // get a list of all regions from the master. This involves
     // scanning the META table
     if (!recordRootRegion()) {
       // Will remove later if we can fix it
-      errors.reportError("Encountered fatal error. Exitting...");
+      errors.reportError("Encountered fatal error. Exiting...");
       return -1;
     }
     getMetaEntries();
 
-    // Check if .META. is found only once and on the right place
+    // Check if .META. is found only once and in the right place
     if (!checkMetaEntries()) {
       // Will remove later if we can fix it
-      errors.reportError("Encountered fatal error. Exitting...");
+      errors.reportError("Encountered fatal error. Exiting...");
       return -1;
     }
 
@@ -225,6 +221,10 @@ public class HBaseFsck {
     return errors.summarize();
   }
 
+  public ErrorReporter getErrors() {
+    return errors;
+  }
+
   /**
    * Load the list of disabled tables in ZK into local set.
    * @throws ZooKeeperConnectionException
@@ -281,7 +281,8 @@ public class HBaseFsck {
 
     // verify that version file exists
     if (!foundVersionFile) {
-      errors.reportError("Version file does not exist in root dir " + rootDir);
+      errors.reportError(ERROR_CODE.NO_VERSION_FILE,
+          "Version file does not exist in root dir " + rootDir);
     }
 
     // level 1:  <HBASE_DIR>/*
@@ -315,7 +316,8 @@ public class HBaseFsck {
     // Check if Root region is valid and existing
     if (rootLocation == null || rootLocation.getRegionInfo() == null ||
         rootLocation.getHostname() == null) {
-      errors.reportError("Root Region or some of its attributes is null.");
+      errors.reportError(ERROR_CODE.NULL_ROOT_REGION,
+        "Root Region or some of its attributes are null.");
       return false;
     }
     ServerName sn;
@@ -428,7 +430,7 @@ public class HBaseFsck {
       LOG.debug("Region " + descriptiveName + " offline, ignoring.");
       return;
     } else if (recentlyModified) {
-      LOG.info("Region " + descriptiveName + " was recently modified -- skipping");
+      LOG.warn("Region " + descriptiveName + " was recently modified -- skipping");
       return;
     }
     // ========== Cases where the region is not in META =============
@@ -436,24 +438,29 @@ public class HBaseFsck {
       // We shouldn't have record of this region at all then!
       assert false : "Entry for region with no data";
     } else if (!inMeta && !inHdfs && isDeployed) {
-      errors.reportError("Region " + descriptiveName + ", key=" + key + ", not on HDFS or in META but " +
-        "deployed on " + Joiner.on(", ").join(hbi.deployedOn));
+      errors.reportError(ERROR_CODE.NOT_IN_META_HDFS, "Region "
+          + descriptiveName + ", key=" + key + ", not on HDFS or in META but " +
+          "deployed on " + Joiner.on(", ").join(hbi.deployedOn));
     } else if (!inMeta && inHdfs && !isDeployed) {
-      errors.reportError("Region " + descriptiveName + " on HDFS, but not listed in META " +
-        "or deployed on any region server.");
+      errors.reportError(ERROR_CODE.NOT_IN_META_OR_DEPLOYED, "Region "
+          + descriptiveName + " on HDFS, but not listed in META " +
+          "or deployed on any region server.");
     } else if (!inMeta && inHdfs && isDeployed) {
-      errors.reportError("Region " + descriptiveName + " not in META, but deployed on " +
-        Joiner.on(", ").join(hbi.deployedOn));
+      errors.reportError(ERROR_CODE.NOT_IN_META, "Region " + descriptiveName
+          + " not in META, but deployed on " + Joiner.on(", ").join(hbi.deployedOn));
 
     // ========== Cases where the region is in META =============
     } else if (inMeta && !inHdfs && !isDeployed) {
-      errors.reportError("Region " + descriptiveName + " found in META, but not in HDFS " +
-        "or deployed on any region server.");
+      errors.reportError(ERROR_CODE.NOT_IN_HDFS_OR_DEPLOYED, "Region "
+          + descriptiveName + " found in META, but not in HDFS "
+          + "or deployed on any region server.");
     } else if (inMeta && !inHdfs && isDeployed) {
-      errors.reportError("Region " + descriptiveName + " found in META, but not in HDFS, " +
-        "and deployed on " + Joiner.on(", ").join(hbi.deployedOn));
+      errors.reportError(ERROR_CODE.NOT_IN_HDFS, "Region " + descriptiveName
+          + " found in META, but not in HDFS, " +
+          "and deployed on " + Joiner.on(", ").join(hbi.deployedOn));
     } else if (inMeta && inHdfs && !isDeployed && shouldBeDeployed) {
-      errors.reportError("Region " + descriptiveName + " not deployed on any region server.");
+      errors.reportError(ERROR_CODE.NOT_DEPLOYED, "Region " + descriptiveName
+          + " not deployed on any region server.");
       // If we are trying to fix the errors
       if (shouldFix()) {
         errors.print("Trying to fix unassigned region...");
@@ -461,12 +468,14 @@ public class HBaseFsck {
         HBaseFsckRepair.fixUnassigned(this.conf, hbi.metaEntry);
       }
     } else if (inMeta && inHdfs && isDeployed && !shouldBeDeployed) {
-      errors.reportError("Region " + descriptiveName + " should not be deployed according " +
-        "to META, but is deployed on " + Joiner.on(", ").join(hbi.deployedOn));
+      errors.reportError(ERROR_CODE.SHOULD_NOT_BE_DEPLOYED, "Region "
+          + descriptiveName + " should not be deployed according " +
+          "to META, but is deployed on " + Joiner.on(", ").join(hbi.deployedOn));
     } else if (inMeta && inHdfs && isMultiplyDeployed) {
-      errors.reportError("Region " + descriptiveName + " is listed in META on region server " +
-        hbi.metaEntry.regionServer + " but is multiply assigned to region servers " +
-        Joiner.on(", ").join(hbi.deployedOn));
+      errors.reportError(ERROR_CODE.MULTI_DEPLOYED, "Region " + descriptiveName
+          + " is listed in META on region server " + hbi.metaEntry.regionServer
+          + " but is multiply assigned to region servers " +
+          Joiner.on(", ").join(hbi.deployedOn));
       // If we are trying to fix the errors
       if (shouldFix()) {
         errors.print("Trying to fix assignment error...");
@@ -474,9 +483,10 @@ public class HBaseFsck {
         HBaseFsckRepair.fixDupeAssignment(this.conf, hbi.metaEntry, hbi.deployedOn);
       }
     } else if (inMeta && inHdfs && isDeployed && !deploymentMatchesMeta) {
-      errors.reportError("Region " + descriptiveName + " listed in META on region server " +
-        hbi.metaEntry.regionServer + " but found on region server " +
-        hbi.deployedOn.get(0));
+      errors.reportError(ERROR_CODE.SERVER_DOES_NOT_MATCH_META, "Region "
+          + descriptiveName + " listed in META on region server " +
+          hbi.metaEntry.regionServer + " but found on region server " +
+          hbi.deployedOn.get(0));
       // If we are trying to fix the errors
       if (shouldFix()) {
         errors.print("Trying to fix assignment error...");
@@ -484,13 +494,14 @@ public class HBaseFsck {
         HBaseFsckRepair.fixDupeAssignment(this.conf, hbi.metaEntry, hbi.deployedOn);
       }
     } else {
-      errors.reportError("Region " + descriptiveName + " is in an unforeseen state:" +
-        " inMeta=" + inMeta +
-        " inHdfs=" + inHdfs +
-        " isDeployed=" + isDeployed +
-        " isMultiplyDeployed=" + isMultiplyDeployed +
-        " deploymentMatchesMeta=" + deploymentMatchesMeta +
-        " shouldBeDeployed=" + shouldBeDeployed);
+      errors.reportError(ERROR_CODE.UNKNOWN, "Region " + descriptiveName +
+          " is in an unforeseen state:" +
+          " inMeta=" + inMeta +
+          " inHdfs=" + inHdfs +
+          " isDeployed=" + isDeployed +
+          " isMultiplyDeployed=" + isMultiplyDeployed +
+          " deploymentMatchesMeta=" + deploymentMatchesMeta +
+          " shouldBeDeployed=" + shouldBeDeployed);
     }
   }
 
@@ -504,10 +515,15 @@ public class HBaseFsck {
       // Check only valid, working regions
       if (hbi.metaEntry == null) continue;
       if (hbi.metaEntry.regionServer == null) continue;
-      if (hbi.foundRegionDir == null) continue;
-      if (hbi.deployedOn.size() != 1) continue;
       if (hbi.onlyEdits) continue;
 
+      // Missing regionDir or over-deployment is checked elsewhere. Include
+      // these cases in modTInfo, so we can evaluate those regions as part of
+      // the region chain in META
+      //if (hbi.foundRegionDir == null) continue;
+      //if (hbi.deployedOn.size() != 1) continue;
+      if (hbi.deployedOn.size() == 0) continue;
+
       // We should be safe here
       String tableName = hbi.metaEntry.getTableDesc().getNameAsString();
       TInfo modTInfo = tablesInfo.get(tableName);
@@ -517,13 +533,16 @@ public class HBaseFsck {
       for (ServerName server : hbi.deployedOn) {
         modTInfo.addServer(server);
       }
-      modTInfo.addEdge(hbi.metaEntry.getStartKey(), hbi.metaEntry.getEndKey());
+
+      //modTInfo.addEdge(hbi.metaEntry.getStartKey(), hbi.metaEntry.getEndKey());
+      modTInfo.addRegionInfo(hbi);
+
       tablesInfo.put(tableName, modTInfo);
     }
 
     for (TInfo tInfo : tablesInfo.values()) {
-      if (!tInfo.check()) {
-        errors.reportError("Found inconsistency in table " + tInfo.getName());
+      if (!tInfo.checkRegionChain()) {
+        errors.report("Found inconsistency in table " + tInfo.getName());
       }
     }
   }
@@ -533,17 +552,16 @@ public class HBaseFsck {
    */
   private class TInfo {
     String tableName;
-    TreeMap <byte[], byte[]> edges;
     TreeSet <ServerName> deployedOn;
+    List<HbckInfo> regions = new ArrayList<HbckInfo>();
 
     TInfo(String name) {
       this.tableName = name;
-      edges = new TreeMap <byte[], byte[]> (Bytes.BYTES_COMPARATOR);
       deployedOn = new TreeSet <ServerName>();
     }
 
-    public void addEdge(byte[] fromNode, byte[] toNode) {
-      this.edges.put(fromNode, toNode);
+    public void addRegionInfo (HbckInfo r) {
+      regions.add(r);
     }
 
     public void addServer(ServerName server) {
@@ -555,46 +573,77 @@ public class HBaseFsck {
     }
 
     public int getNumRegions() {
-      return edges.size();
+      return regions.size();
     }
 
-    public boolean check() {
-      byte[] last = new byte[0];
-      byte[] next = new byte[0];
-      TreeSet <byte[]> visited = new TreeSet<byte[]>(Bytes.BYTES_COMPARATOR);
-      // Each table should start with a zero-length byte[] and end at a
-      // zero-length byte[]. Just follow the edges to see if this is true
-      while (true) {
-        // Check if chain is broken
-        if (!edges.containsKey(last)) {
-          errors.detail("Chain of regions in table " + tableName +
-            " is broken; edges does not contain " + Bytes.toString(last));
-          return false;
-        }
-        next = edges.get(last);
-        // Found a cycle
-        if (visited.contains(next)) {
-          errors.detail("Chain of regions in table " + tableName +
-            " has a cycle around " + Bytes.toString(next));
-          return false;
-        }
-        // Mark next node as visited
-        visited.add(next);
-        // If next is zero-length byte[] we are possibly at the end of the chain
-        if (next.length == 0) {
-          // If we have visited all elements we are fine
-          if (edges.size() != visited.size()) {
-            errors.detail("Chain of regions in table " + tableName +
-              " contains less elements than are listed in META; visited=" + visited.size() +
-              ", edges=" + edges.size());
-            return false;
+    /**
+     * Check the region chain (from META) of this table.  We are looking for
+     * holes, overlaps, and cycles.
+     * @return false if there are errors
+     */
+    public boolean checkRegionChain() {
+      Collections.sort(regions);
+      HbckInfo last = null;
+
+      for (HbckInfo r : regions) {
+        if (last == null) {
+          // This is the first region, check that the start key is empty
+          if (! Bytes.equals(r.metaEntry.getStartKey(), HConstants.EMPTY_BYTE_ARRAY)) {
+            errors.reportError(ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY,
+                "First region should start with an empty key.",
+                this, r);
           }
-          return true;
+        } else {
+
+          // Check if endKey < startKey
+          // Previous implementation of this code checked for a cycle in the
+          // region chain.  A cycle would imply that the endKey comes before
+          // the startKey (i.e. endKey < startKey).
+          if (! Bytes.equals(r.metaEntry.getEndKey(), HConstants.EMPTY_BYTE_ARRAY)) {
+            // continue with this check if this is not the last region
+            int cmpRegionKeys = Bytes.compareTo(r.metaEntry.getStartKey(),
+                r.metaEntry.getEndKey());
+            if (cmpRegionKeys > 0) {
+              errors.reportError(ERROR_CODE.REGION_CYCLE,
+                  String.format("The endkey for this region comes before the "
+                      + "startkey, startkey=%s, endkey=%s",
+                      Bytes.toString(r.metaEntry.getStartKey()),
+                      Bytes.toString(r.metaEntry.getEndKey())),
+                  this, r, last);
+            }
+          }
+
+          // Check if the startkeys are different
+          if (Bytes.equals(r.metaEntry.getStartKey(), last.metaEntry.getStartKey())) {
+            errors.reportError(ERROR_CODE.DUPE_STARTKEYS,
+                "Two regions have the same startkey: "
+                    + Bytes.toString(r.metaEntry.getStartKey()),
+                this, r, last);
+          } else {
+            // Check that the startkey is the same as the previous end key
+            int cmp = Bytes.compareTo(r.metaEntry.getStartKey(),
+                last.metaEntry.getEndKey());
+            if (cmp > 0) {
+              // hole
+              errors.reportError(ERROR_CODE.HOLE_IN_REGION_CHAIN,
+                  "There is a hole in the region chain.",
+                  this, r, last);
+            } else if (cmp < 0) {
+              // overlap
+              errors.reportError(ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
+                  "There is an overlap in the region chain.",
+                  this, r, last);
+            }
+          }
+
         }
-        last = next;
+
+        last = r;
       }
-      // How did we get here?
+
+      return errors.getErrorList().size() == 0;
     }
+
   }
 
   /**
@@ -603,7 +652,6 @@ public class HBaseFsck {
    * if any of the REGIONINFO_QUALIFIER, SERVER_QUALIFIER, STARTCODE_QUALIFIER,
    * SPLITA_QUALIFIER, SPLITB_QUALIFIER have not changed in the last
    * milliseconds specified by timelag, then the table is a candidate to be returned.
-   * @param regionList - all entries found in .META
    * @return tables that have not been modified recently
    * @throws IOException if an error is encountered
    */
@@ -668,7 +716,7 @@ public class HBaseFsck {
 
       // If there is no region holding .META.
       if (metaRegions.size() == 0) {
-        errors.reportError(".META. is not found on any region.");
+        errors.reportError(ERROR_CODE.NO_META_REGION, ".META. is not found on any region.");
         if (shouldFix()) {
           errors.print("Trying to fix a problem with .META...");
           setShouldRerun();
@@ -678,7 +726,7 @@ public class HBaseFsck {
       }
       // If there are more than one regions pretending to hold the .META.
       else if (metaRegions.size() > 1) {
-        errors.reportError(".META. is found on more than one region.");
+        errors.reportError(ERROR_CODE.MULTI_META_REGION, ".META. is found on more than one region.");
         if (shouldFix()) {
           errors.print("Trying to fix a problem with .META...");
           setShouldRerun();
@@ -773,7 +821,7 @@ public class HBaseFsck {
   /**
    * Maintain information about a particular region.
    */
-  static class HbckInfo {
+  static class HbckInfo implements Comparable {
     boolean onlyEdits = false;
     MetaEntry metaEntry = null;
     FileStatus foundRegionDir = null;
@@ -796,6 +844,16 @@ public class HBaseFsck {
         return "UNKNOWN_REGION on " + Joiner.on(", ").join(deployedOn);
       }
     }
+
+    @Override
+    public int compareTo(Object o) {
+      HbckInfo other = (HbckInfo) o;
+      int startComparison = Bytes.compareTo(this.metaEntry.getStartKey(), other.metaEntry.getStartKey());
+      if (startComparison != 0)
+        return startComparison;
+      else
+        return Bytes.compareTo(this.metaEntry.getEndKey(), other.metaEntry.getEndKey());
+    }
   }
 
   /**
@@ -804,10 +862,10 @@ public class HBaseFsck {
   private void printTableSummary() {
     System.out.println("Summary:");
     for (TInfo tInfo : tablesInfo.values()) {
-      if (tInfo.check()) {
-        System.out.println("  " + tInfo.getName() + " is okay.");
-      } else {
+      if (errors.tableHasErrors(tInfo)) {
         System.out.println("Table " + tInfo.getName() + " is inconsistent.");
+      } else {
+        System.out.println("  " + tInfo.getName() + " is okay.");
       }
       System.out.println("    Number of regions: " + tInfo.getNumRegions());
       System.out.print("    Deployed on: ");
@@ -819,19 +877,45 @@ public class HBaseFsck {
   }
 
   interface ErrorReporter {
+    public static enum ERROR_CODE {
+      UNKNOWN, NO_META_REGION, NULL_ROOT_REGION, NO_VERSION_FILE, NOT_IN_META_HDFS, NOT_IN_META,
+      NOT_IN_META_OR_DEPLOYED, NOT_IN_HDFS_OR_DEPLOYED, NOT_IN_HDFS, SERVER_DOES_NOT_MATCH_META, NOT_DEPLOYED,
+      MULTI_DEPLOYED, SHOULD_NOT_BE_DEPLOYED, MULTI_META_REGION, RS_CONNECT_FAILURE,
+      FIRST_REGION_STARTKEY_NOT_EMPTY, DUPE_STARTKEYS,
+      HOLE_IN_REGION_CHAIN, OVERLAP_IN_REGION_CHAIN, REGION_CYCLE
+    }
+    public void clear();
+    public void report(String message);
     public void reportError(String message);
+    public void reportError(ERROR_CODE errorCode, String message);
+    public void reportError(ERROR_CODE errorCode, String message, TInfo table, HbckInfo info);
+    public void reportError(ERROR_CODE errorCode, String message, TInfo table, HbckInfo info1, HbckInfo info2);
     public int summarize();
     public void detail(String details);
+    public ArrayList<ERROR_CODE> getErrorList();
     public void progress();
     public void print(String message);
     public void resetErrors();
+    public boolean tableHasErrors(TInfo table);
   }
 
   private static class PrintingErrorReporter implements ErrorReporter {
     public int errorCount = 0;
     private int showProgress;
 
-    public synchronized void reportError(String message) {
+    Set<TInfo> errorTables = new HashSet<TInfo>();
+
+    // for use by unit tests to verify which errors were discovered
+    private ArrayList<ERROR_CODE> errorList = new ArrayList<ERROR_CODE>();
+
+    public void clear() {
+      errorTables.clear();
+      errorList.clear();
+      errorCount = 0;
+    }
+
+    public synchronized void reportError(ERROR_CODE errorCode, String message) {
+      errorList.add(errorCode);
       if (!summary) {
         System.out.println("ERROR: " + message);
       }
@@ -839,6 +923,37 @@ public class HBaseFsck {
       showProgress = 0;
     }
 
+    public synchronized void reportError(ERROR_CODE errorCode, String message, TInfo table,
+                                         HbckInfo info) {
+      errorTables.add(table);
+      String reference = "(region " + info.metaEntry.getRegionNameAsString() + ")";
+      reportError(errorCode, reference + " " + message);
+    }
+
+    public synchronized void reportError(ERROR_CODE errorCode, String message, TInfo table,
+                                         HbckInfo info1, HbckInfo info2) {
+      errorTables.add(table);
+      String reference = "(regions " + info1.metaEntry.getRegionNameAsString()
+          + " and " + info2.metaEntry.getRegionNameAsString() + ")";
+      reportError(errorCode, reference + " " + message);
+    }
+
+    public synchronized void reportError(String message) {
+      reportError(ERROR_CODE.UNKNOWN, message);
+    }
+
+    /**
+     * Report error information, but do not increment the error count.  Intended for cases
+     * where the actual error would have been reported previously.
+     * @param message
+     */
+    public synchronized void report(String message) {
+      if (! summary) {
+        System.out.println("ERROR: " + message);
+      }
+      showProgress = 0;
+    }
+
     public synchronized int summarize() {
       System.out.println(Integer.toString(errorCount) +
                          " inconsistencies detected.");
@@ -851,6 +966,10 @@ public class HBaseFsck {
       }
     }
 
+    public ArrayList<ERROR_CODE> getErrorList() {
+      return errorList;
+    }
+
     public synchronized void print(String message) {
       if (!summary) {
         System.out.println(message);
@@ -858,6 +977,11 @@ public class HBaseFsck {
     }
 
     @Override
+    public boolean tableHasErrors(TInfo table) {
+      return errorTables.contains(table);
+    }
+
+    @Override
     public void resetErrors() {
       errorCount = 0;
     }
@@ -923,14 +1047,14 @@ public class HBaseFsck {
           }
         }
 
-        // check to see if the existance of this region matches the region in META
+        // check to see if the existence of this region matches the region in META
         for (HRegionInfo r:regions) {
           HbckInfo hbi = hbck.getOrCreateInfo(r.getEncodedName());
           hbi.addServer(rsinfo);
         }
       } catch (IOException e) {          // unable to connect to the region server. 
-        errors.reportError("RegionServer: " + rsinfo.getServerName() +
-                      " Unable to fetch region information. " + e);
+        errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "RegionServer: " + rsinfo.getServerName() +
+          " Unable to fetch region information. " + e);
       } finally {
         done = true;
         notifyAll(); // wakeup anybody waiting for this item to be done
@@ -1000,7 +1124,7 @@ public class HBaseFsck {
           }
         }
       } catch (IOException e) {          // unable to connect to the region server. 
-        errors.reportError("Table Directory: " + tableDir.getPath().getName() +
+        errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "Table Directory: " + tableDir.getPath().getName() +
                       " Unable to fetch region information. " + e);
       } finally {
         done = true;

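The new ERROR_CODE list makes hbck findings consumable programmatically rather than only through the integer return code. A minimal usage sketch, mirroring the pattern the updated TestHBaseFsck below uses; the wrapper class, its main method, and the running-cluster Configuration are assumptions for the example, while the HBaseFsck calls themselves come from this commit:

    // Same package as HBaseFsck, in case some of these setters are package-private.
    package org.apache.hadoop.hbase.util;

    import java.util.List;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hbase.HBaseConfiguration;

    import static org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;

    public class HbckErrorCodeSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();  // assumes a reachable cluster
        HBaseFsck fsck = new HBaseFsck(conf);
        fsck.displayFullReport();   // i.e. -details
        fsck.setTimeLag(0);
        fsck.setFixErrors(false);   // report only, do not attempt repairs
        fsck.doWork();
        // Inspect exactly which problems were found instead of a single return code.
        List<ERROR_CODE> errors = fsck.getErrors().getErrorList();
        for (ERROR_CODE code : errors) {
          System.out.println("hbck found: " + code);
        }
      }
    }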
Modified: hbase/trunk/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java?rev=1099566&r1=1099565&r2=1099566&view=diff
==============================================================================
--- hbase/trunk/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java (original)
+++ hbase/trunk/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java Wed May  4 19:28:45 2011
@@ -89,6 +89,7 @@ public class HBaseTestingUtility {
    */
   private boolean passedZkCluster = false;
   private MiniDFSCluster dfsCluster = null;
+
   private MiniHBaseCluster hbaseCluster = null;
   private MiniMRCluster mrCluster = null;
   // If non-null, then already a cluster running.
@@ -113,6 +114,10 @@ public class HBaseTestingUtility {
     this.conf = conf;
   }
 
+  public MiniHBaseCluster getHbaseCluster() {
+    return hbaseCluster;
+  }
+
   /**
    * Returns this classes's instance of {@link Configuration}.  Be careful how
    * you use the returned Configuration since {@link HConnection} instances

Modified: hbase/trunk/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java?rev=1099566&r1=1099565&r2=1099566&view=diff
==============================================================================
--- hbase/trunk/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java (original)
+++ hbase/trunk/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java Wed May  4 19:28:45 2011
@@ -19,23 +19,28 @@
  */
 package org.apache.hadoop.hbase.util;
 
+import static org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
+import static org.junit.Assert.assertArrayEquals;
 import static org.junit.Assert.assertEquals;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.hbase.HBaseTestingUtility;
-import org.apache.hadoop.hbase.HConstants;
-import org.apache.hadoop.hbase.HTableDescriptor;
-import org.apache.hadoop.hbase.ServerName;
-import org.apache.hadoop.hbase.client.HTable;
-import org.apache.hadoop.hbase.client.Put;
-import org.apache.hadoop.hbase.client.Result;
-import org.apache.hadoop.hbase.client.ResultScanner;
-import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.*;
+import org.apache.hadoop.hbase.client.*;
+import org.apache.hadoop.hbase.ipc.HRegionInterface;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+
 public class TestHBaseFsck {
 
   final Log LOG = LogFactory.getLog(getClass());
@@ -50,32 +55,39 @@ public class TestHBaseFsck {
     TEST_UTIL.startMiniCluster(3);
   }
 
-  private int doFsck(boolean fix) throws Exception {
+  private List doFsck(boolean fix) throws Exception {
     HBaseFsck fsck = new HBaseFsck(conf);
-    fsck.displayFullReport();
+    fsck.displayFullReport();  // i.e. -details
     fsck.setTimeLag(0);
     fsck.setFixErrors(fix);
-    // Most basic check ever, 0 tables
-    return fsck.doWork();
+    fsck.doWork();
+    return fsck.getErrors().getErrorList();
+  }
+
+  private void assertNoErrors(List errs) throws Exception {
+    assertEquals(0, errs.size());
+  }
+
+  private void assertErrors(List errs, ERROR_CODE[] expectedErrors) {
+    assertEquals(Arrays.asList(expectedErrors), errs);
   }
 
   @Test
   public void testHBaseFsck() throws Exception {
-    int result = doFsck(false);
-    assertEquals(0, result);
+    assertNoErrors(doFsck(false));
 
     TEST_UTIL.createTable(TABLE, FAM);
 
     // We created 1 table, should be fine
-    result = doFsck(false);
-    assertEquals(0, result);
+    assertNoErrors(doFsck(false));
 
     // Now let's mess it up and change the assignment in .META. to
     // point to a different region server
     HTable meta = new HTable(conf, HTableDescriptor.META_TABLEDESC.getName());
     ResultScanner scanner = meta.getScanner(new Scan());
 
-    resforloop : for (Result res : scanner) {
+    resforloop:
+    for (Result res : scanner) {
       long startCode = Bytes.toLong(res.getValue(HConstants.CATALOG_FAMILY,
           HConstants.STARTCODE_QUALIFIER));
 
@@ -98,13 +110,90 @@ public class TestHBaseFsck {
     }
 
     // Try to fix the data
-    result = doFsck(true);
-    assertEquals(-1, result);
-
+    assertErrors(doFsck(true), new ERROR_CODE[]{
+        ERROR_CODE.SERVER_DOES_NOT_MATCH_META});
     Thread.sleep(15000);
-    result = doFsck(false);
-    // Should have fixed
-    assertEquals(0, result);
+
+    // Should be fixed now
+    assertNoErrors(doFsck(false));
+
+    // comment needed - what is the purpose of this line
     new HTable(conf, TABLE).getScanner(new Scan());
   }
+
+  private HRegionInfo createRegion(Configuration conf, final HTableDescriptor
+      htd, byte[] startKey, byte[] endKey)
+      throws IOException {
+    HTable meta = new HTable(conf, HConstants.META_TABLE_NAME);
+    HRegionInfo hri = new HRegionInfo(htd, startKey, endKey);
+    Put put = new Put(hri.getRegionName());
+    put.add(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER,
+        Writables.getBytes(hri));
+    meta.put(put);
+    return hri;
+  }
+
+  @Test
+  /**
+   * Tests for inconsistencies in the META data (duplicate start keys, or holes)
+   */
+  public void testHBaseFsckMeta() throws Exception {
+    assertNoErrors(doFsck(false));
+
+    HTable tbl = TEST_UTIL.createTable(Bytes.toBytes("table2"), FAM);
+
+    HRegionInfo hriOrig = tbl.getRegionsInfo().keySet().iterator().next();
+    HServerAddress rsAddressOrig = tbl.getRegionsInfo().get(hriOrig);
+
+    byte[][] startKeys = new byte[][]{
+        HConstants.EMPTY_BYTE_ARRAY,
+        Bytes.toBytes("A"),
+        Bytes.toBytes("B"),
+        Bytes.toBytes("C")
+    };
+    TEST_UTIL.createMultiRegions(conf, tbl, FAM, startKeys);
+    Path rootDir = new Path(conf.get(HConstants.HBASE_DIR));
+    FileSystem fs = rootDir.getFileSystem(conf);
+    Path p = new Path(rootDir + "/table2", hriOrig.getEncodedName());
+    fs.delete(p, true);
+
+    Thread.sleep(1 * 1000);
+    ArrayList servers = new ArrayList();
+    servers.add(rsAddressOrig);
+    HBaseFsckRepair.fixDupeAssignment(conf, hriOrig, servers);
+
+    // We created 1 table, should be fine
+    assertNoErrors(doFsck(false));
+
+    // Now let's mess it up, by adding a region with a duplicate startkey
+    HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
+        Bytes.toBytes("A"), Bytes.toBytes("A2"));
+    TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
+    TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
+        .waitForAssignment(hriDupe);
+    assertErrors(doFsck(false), new ERROR_CODE[]{ERROR_CODE.DUPE_STARTKEYS});
+
+    // Mess it up by creating an overlap in the metadata
+    HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
+        Bytes.toBytes("A2"), Bytes.toBytes("B2"));
+    TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
+    TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
+        .waitForAssignment(hriOverlap);
+    assertErrors(doFsck(false), new ERROR_CODE[]{
+        ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
+        ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
+
+    // Mess it up by leaving a hole in the meta data
+    HRegionInfo hriHole = createRegion(conf, tbl.getTableDescriptor(),
+        Bytes.toBytes("D"), Bytes.toBytes("E"));
+    TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriHole);
+    TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
+        .waitForAssignment(hriHole);
+//    assertError(doFsck(false), ERROR_CODE.OVERLAP_IN_REGION_CHAIN);
+    assertErrors(doFsck(false), new ERROR_CODE[]{ ERROR_CODE.DUPE_STARTKEYS,
+        ERROR_CODE.OVERLAP_IN_REGION_CHAIN, ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
+        ERROR_CODE.HOLE_IN_REGION_CHAIN });
+
+  }
+
 }