You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ns...@apache.org on 2011/10/11 04:14:06 UTC

svn commit: r1181501 - in /hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase: HServerAddress.java client/HBaseFsck.java

Author: nspiegelberg
Date: Tue Oct 11 02:14:05 2011
New Revision: 1181501

URL: http://svn.apache.org/viewvc?rev=1181501&view=rev
Log:
Fix HBCK Edge Case Issues

Summary:
Fixed a number of miscellaneous HBCK issues & made
enhancements.
 1) HServerAddress could have a.equals(b) && a.hashCode() !=
 b.hashCode(), which violates Java's equals/hashCode contract
 2) Refactored CLI for extensibility
 3) Added '-timeout' for AppOps to control max HBCK time
 4) Issue only warnings if we cannot get info from a region server (RS)
 5) Run RS & HDFS scans in parallel

Test Plan:
- bin/hbase hbck

Reviewed By: kannan
Reviewers: jgray, dhruba, kannan, kranganathan
Commenters: dhruba
CC: nspiegelberg, mwelty, dhruba, achao, kannan, gsmyth, hbase@lists
Tasks:
#510832: Reduce HBCK Latency

Revert Plan:
OK

Differential Revision: 222607

Modified:
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/HServerAddress.java
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsck.java

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/HServerAddress.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/HServerAddress.java?rev=1181501&r1=1181500&r2=1181501&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/HServerAddress.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/HServerAddress.java Tue Oct 11 02:14:05 2011
@@ -147,7 +147,6 @@ public class HServerAddress implements W
   @Override
   public int hashCode() {
     int result = address.hashCode();
-    result ^= stringValue.hashCode();
     return result;
   }
 

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsck.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsck.java?rev=1181501&r1=1181500&r2=1181501&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsck.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/client/HBaseFsck.java Tue Oct 11 02:14:05 2011
@@ -33,6 +33,13 @@ import java.util.concurrent.ThreadPoolEx
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.LinkedBlockingQueue;
 
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.cli.PatternOptionBuilder;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -58,6 +65,7 @@ import org.apache.hadoop.util.StringUtil
 
 import com.google.common.base.Joiner;
 import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
 
 /**
  * Check consistency among the in-memory states of the master and the
@@ -74,6 +82,7 @@ public class HBaseFsck {
   private HConnection connection;
   private TreeMap<String, HbckInfo> regionInfo = new TreeMap<String, HbckInfo>();
   private TreeMap<String, TInfo> tablesInfo = new TreeMap<String, TInfo>();
+  private Set<HServerAddress> couldNotScan = Sets.newHashSet();
   ErrorReporter errors = new PrintingErrorReporter();
 
   private static boolean details = false; // do we display the full report
@@ -88,6 +97,7 @@ public class HBaseFsck {
   private int numThreads = MAX_NUM_THREADS;
 
   ThreadPoolExecutor executor;         // threads to retrieve data from regionservers
+  private List<WorkItem> asyncWork = Lists.newArrayList();
 
   /**
    * Constructor
@@ -180,10 +190,13 @@ public class HBaseFsck {
     }
 
     // Determine what's deployed
-    processRegionServers(regionServers);
+    scanRegionServers(regionServers);
 
     // Determine what's on HDFS
-    checkHdfs();
+    scanHdfs();
+
+    // finish all async tasks before analyzing what we have
+    finishAsyncWork();
 
     // Check consistency
     checkConsistency();
@@ -201,7 +214,7 @@ public class HBaseFsck {
    * Scan HDFS for all regions, recording their information into
    * regionInfo
    */
-  void checkHdfs() throws IOException, InterruptedException {
+  void scanHdfs() throws IOException, InterruptedException {
     Path rootDir = new Path(conf.get(HConstants.HBASE_DIR));
     FileSystem fs = rootDir.getFileSystem(conf);
 
@@ -222,23 +235,12 @@ public class HBaseFsck {
     if (!foundVersionFile) {
       errors.reportError("Version file does not exist in root dir " + rootDir);
     }
-    //
-    // level 1:  <HBASE_DIR>/*
-    WorkItemHdfsDir[] dirs = new WorkItemHdfsDir[tableDirs.size()];
-    int num = 0;
-    for (FileStatus tableDir : tableDirs) {
-      dirs[num] = new WorkItemHdfsDir(this, fs, errors, tableDir);
-      executor.execute(dirs[num]);
-      num++;
-    }
 
-    // wait for all directories to be done
-    for (int i = 0; i < num; i++) {
-      synchronized (dirs[i]) {
-        while (!dirs[i].isDone()) {
-          dirs[i].wait();
-        }
-      }
+    // scan all the HDFS directories in parallel
+    for (FileStatus tableDir : tableDirs) {
+      WorkItem work = new WorkItemHdfsDir(this, fs, errors, tableDir);
+      executor.execute(work);
+      asyncWork.add(work);
     }
   }
 
@@ -271,27 +273,27 @@ public class HBaseFsck {
    * @param regionServerList - the list of region servers to connect to
    * @throws IOException if a remote or network exception occurs
    */
-  void processRegionServers(Collection<HServerInfo> regionServerList)
+  void scanRegionServers(Collection<HServerInfo> regionServerList)
     throws IOException, InterruptedException {
 
-    WorkItemRegion[] work = new WorkItemRegion[regionServerList.size()];
-    int num = 0;
-
     // loop to contact each region server in parallel
     for (HServerInfo rsinfo:regionServerList) {
-      work[num] = new WorkItemRegion(this, rsinfo, errors, connection);
-      executor.execute(work[num]);
-      num++;
+      WorkItem work = new WorkItemRegion(this, rsinfo, errors, connection);
+      executor.execute(work);
+      asyncWork.add(work);
     }
+  }
 
-    // wait for all submitted tasks to be done
-    for (int i = 0; i < num; i++) {
-      synchronized (work[i]) {
-        while (!work[i].isDone()) {
-          work[i].wait();
+  void finishAsyncWork() throws InterruptedException {
+    // wait for all directories to be done
+    for (WorkItem work : this.asyncWork) {
+      synchronized (work) {
+        while (!work.isDone()) {
+          work.wait();
         }
       }
     }
+
   }
 
   /**
@@ -359,16 +361,22 @@ public class HBaseFsck {
       errors.reportError("Region " + descriptiveName + " found in META, but not in HDFS, " +
         "and deployed on " + Joiner.on(", ").join(hbi.deployedOn));
     } else if (inMeta && inHdfs && !isDeployed && shouldBeDeployed) {
-      errors.reportWarning("Region " + descriptiveName + " not deployed on any region server.");
-      // If we are trying to fix the errors
-      if (fix == FixState.ALL) {
-        errors.print("Trying to fix unassigned region...");
-        if (HBaseFsckRepair.fixUnassigned(this.conf, hbi.metaEntry)) {
-          setShouldRerun();
+      if (couldNotScan.contains(hbi.metaEntry.regionServer)) {
+        LOG.info("Could not verify region " + descriptiveName
+            + " because could not scan supposed owner "
+            + hbi.metaEntry.regionServer);
+      } else {
+        errors.reportWarning("Region " + descriptiveName + " not deployed on any region server.");
+        // If we are trying to fix the errors
+        if (fix == FixState.ALL) {
+          errors.print("Trying to fix unassigned region...");
+          if (HBaseFsckRepair.fixUnassigned(this.conf, hbi.metaEntry)) {
+            setShouldRerun();
+          }
         }
       }
     } else if (inMeta && inHdfs && isDeployed && !shouldBeDeployed) {
-      errors.reportError("Region " + descriptiveName + " has should not be deployed according " +
+      errors.reportError("Region " + descriptiveName + " should not be deployed according " +
         "to META, but is deployed on " + Joiner.on(", ").join(hbi.deployedOn));
     } else if (inMeta && inHdfs && isMultiplyDeployed) {
       errors.reportFixableError("Region " + descriptiveName +
@@ -415,7 +423,8 @@ public class HBaseFsck {
       if (hbi.metaEntry == null) continue;
       if (hbi.metaEntry.regionServer == null) continue;
       if (hbi.foundRegionDir == null) continue;
-      if (hbi.deployedOn.size() != 1) continue;
+      if (hbi.deployedOn.isEmpty()
+          && !couldNotScan.contains(hbi.metaEntry.regionServer)) continue;
       if (hbi.onlyEdits) continue;
 
       // We should be safe here
@@ -565,6 +574,10 @@ public class HBaseFsck {
     return uniqueTables.toArray(new HTableDescriptor[uniqueTables.size()]);
   }
 
+  private synchronized boolean addFailedServer(HServerAddress server) {
+    return couldNotScan.add(server);
+  }
+
   /**
    * Gets the entry in regionInfo corresponding to the the given encoded
    * region name. If the region has not been seen yet, a new entry is added
@@ -854,10 +867,14 @@ public class HBaseFsck {
     }
   }
 
+  static interface WorkItem extends Runnable {
+    boolean isDone();
+  }
+
   /**
    * Contact a region server and get all information from it
    */
-  static class WorkItemRegion implements Runnable {
+  static class WorkItemRegion implements WorkItem {
     private HBaseFsck hbck;
     private HServerInfo rsinfo;
     private ErrorReporter errors;
@@ -874,7 +891,7 @@ public class HBaseFsck {
     }
 
     // is this task done?
-    synchronized boolean isDone() {
+    public synchronized boolean isDone() {
       return done;
     }
 
@@ -908,8 +925,9 @@ public class HBaseFsck {
           hbi.addServer(rsinfo.getServerAddress());
         }
       } catch (IOException e) {          // unable to connect to the region server.
-        errors.reportError("RegionServer: " + rsinfo.getServerName() +
-                      " Unable to fetch region information. " + e);
+        errors.reportWarning("RegionServer: " + rsinfo.getServerName()
+          + " Unable to fetch region information. " + e);
+        hbck.addFailedServer(rsinfo.getServerAddress());
       } finally {
         done = true;
         notifyAll(); // wakeup anybody waiting for this item to be done
@@ -920,7 +938,7 @@ public class HBaseFsck {
   /**
    * Contact hdfs and get all information about spcified table directory.
    */
-  static class WorkItemHdfsDir implements Runnable {
+  static class WorkItemHdfsDir implements WorkItem {
     private HBaseFsck hbck;
     private FileStatus tableDir;
     private ErrorReporter errors;
@@ -936,7 +954,7 @@ public class HBaseFsck {
       this.done = false;
     }
 
-    synchronized boolean isDone() {
+    public synchronized boolean isDone() {
       return done;
     }
 
@@ -1046,66 +1064,90 @@ public class HBaseFsck {
     timelag = ms;
   }
 
-  protected static void printUsageAndExit() {
-    System.err.println("Usage: fsck [opts] ");
-    System.err.println(" where [opts] are:");
-    System.err.println("   -details Display full report of all regions.");
-    System.err.println("   -timelag {timeInSeconds}  Process only regions that " +
-                       " have not experienced any metadata updates in the last " +
-                       " {{timeInSeconds} seconds.");
-    System.err.println("   -fix [-w] [-y] Try to fix some of the errors." +
-                       "           -y Do not prompt for reconfirmation from users." +
-                       "           -w Try to fix warnings as well");
-    System.err.println("   -summary Print only summary of the tables and status.");
-    Runtime.getRuntime().exit(-2);
-  }
-
   /**
    * Main program
+   *
    * @param args
+   * @throws ParseException
    */
   public static void main(String [] args)
-    throws IOException, MasterNotRunningException, InterruptedException {
+ throws IOException,
+      MasterNotRunningException, InterruptedException, ParseException {
+
+    Options opt = new Options();
+    opt.addOption(OptionBuilder.withArgName("property=value").hasArg()
+        .withDescription("Override HBase Configuration Settings").create("D"));
+    opt.addOption(OptionBuilder.withArgName("timeInSeconds").hasArg()
+      .withDescription("Ignore regions with metadata updates in the last {timeInSeconds}.")
+      .withType(PatternOptionBuilder.NUMBER_VALUE).create("timelag"));
+    opt.addOption(OptionBuilder.withArgName("timeInSeconds").hasArg()
+      .withDescription("Stop scan jobs after a fixed time & analyze existing data.")
+      .withType(PatternOptionBuilder.NUMBER_VALUE).create("timeout"));
+    opt.addOption("fix", false, "Try to fix some of the errors.");
+    opt.addOption("y", false, "Do not prompt for reconfirmation from users on fix.");
+    opt.addOption("w", false, "Try to fix warnings as well as errors.");
+    opt.addOption("summary", false, "Print only summary of the tables and status.");
+    opt.addOption("detail", false, "Display full report of all regions.");
+    opt.addOption("h", false, "Display this help");
+    CommandLine cmd = new GnuParser().parse(opt, args);
+
+    // any unknown args or -h
+    if (!cmd.getArgList().isEmpty() || cmd.hasOption("h")) {
+      new HelpFormatter().printHelp("hbck", opt);
+      return;
+    }
 
-    // create a fsck object
     Configuration conf = HBaseConfiguration.create();
     conf.set("fs.defaultFS", conf.get("hbase.rootdir"));
+
+    if (cmd.hasOption("D")) {
+      for (String confOpt : cmd.getOptionValues("D")) {
+        String[] kv = confOpt.split("=", 2);
+        if (kv.length == 2) {
+          conf.set(kv[0], kv[1]);
+          LOG.debug("-D configuration override: " + kv[0] + "=" + kv[1]);
+        } else {
+          throw new ParseException("-D option format invalid: " + confOpt);
+        }
+      }
+    }
+    if (cmd.hasOption("timeout")) {
+      Object timeout = cmd.getParsedOptionValue("timeout");
+      if (timeout instanceof Long) {
+        conf.setLong(HConstants.HBASE_RPC_TIMEOUT_KEY, ((Long) timeout).longValue() * 1000);
+      } else {
+        throw new ParseException("-timeout needs a long value.");
+      }
+    }
+
+    // create a fsck object
     HBaseFsck fsck = new HBaseFsck(conf);
     fsck.setTimeLag(HBaseFsckRepair.getEstimatedFixTime(conf));
 
-    // Process command-line args.
-    for (int i = 0; i < args.length; i++) {
-      String cmd = args[i];
-      if (cmd.equals("-details")) {
-        fsck.displayFullReport();
-      } else if (cmd.equals("-timelag")) {
-        if (i == args.length - 1) {
-          System.err.println("HBaseFsck: -timelag needs a value.");
-          printUsageAndExit();
-        }
-        try {
-          long timelag = Long.parseLong(args[i+1]);
-          fsck.setTimeLag(timelag * 1000);
-        } catch (NumberFormatException e) {
-          System.err.println("-timelag needs a numeric value.");
-          printUsageAndExit();
-        }
-        i++;
-      } else if (cmd.equals("-fix")) {
-        fsck.setFixState(FixState.ERROR);
-      } else if (cmd.equals("-w")) {
-        fsck.setFixState(FixState.ALL);
-      } else if (cmd.equals("-y")) {
-        fsck.setPromptResponse(true);
-      } else if (cmd.equals("-summary")) {
-        fsck.setSummary();
+    if (cmd.hasOption("details")) {
+      fsck.displayFullReport();
+    }
+    if (cmd.hasOption("timelag")) {
+      Object timelag = cmd.getParsedOptionValue("timelag");
+      if (timelag instanceof Long) {
+        fsck.setTimeLag(((Long) timelag).longValue() * 1000);
       } else {
-        String str = "Unknown command line option : " + cmd;
-        LOG.info(str);
-        System.out.println(str);
-        printUsageAndExit();
+        throw new ParseException("-timelag needs a long value.");
       }
     }
+    if (cmd.hasOption("fix")) {
+      fsck.setFixState(FixState.ERROR);
+    }
+    if (cmd.hasOption("w")) {
+      fsck.setFixState(FixState.ALL);
+    }
+    if (cmd.hasOption("y")) {
+      fsck.setPromptResponse(true);
+    }
+    if (cmd.equals("summary")) {
+        fsck.setSummary();
+    }
+
     int code = -1;
     try {
       // do the real work of fsck