You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ap...@apache.org on 2015/08/22 19:08:47 UTC

[2/4] hbase git commit: HBASE-13996 Add write sniffing in canary (Liu Shaohui)

HBASE-13996 Add write sniffing in canary (Liu Shaohui)

Conflicts:
	hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java

Amending-Author: Andrew Purtell <ap...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/234a4632
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/234a4632
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/234a4632

Branch: refs/heads/branch-1
Commit: 234a4632a44dd5d8c70209a2dee70f71147a4a24
Parents: f05770b
Author: Andrew Purtell <ap...@apache.org>
Authored: Fri Aug 21 22:44:56 2015 -0700
Committer: Andrew Purtell <ap...@apache.org>
Committed: Fri Aug 21 22:57:28 2015 -0700

----------------------------------------------------------------------
 .../org/apache/hadoop/hbase/HConstants.java     |  29 ++-
 .../hbase/tmpl/master/MasterStatusTmpl.jamon    |  10 +-
 .../org/apache/hadoop/hbase/tool/Canary.java    | 255 +++++++++++++++++--
 src/main/asciidoc/_chapters/ops_mgt.adoc        |  21 ++
 4 files changed, 285 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/234a4632/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
----------------------------------------------------------------------
diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
index 5fafda0..067a4dd 100644
--- a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
+++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
@@ -805,7 +805,8 @@ public final class HConstants {
   /**
    * timeout for short operation RPC
    */
-  public static final String HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY = "hbase.rpc.shortoperation.timeout";
+  public static final String HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY =
+      "hbase.rpc.shortoperation.timeout";
 
   /**
    * Default value of {@link #HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY}
@@ -860,8 +861,8 @@ public final class HConstants {
     */
   public static final float HBASE_CLUSTER_MINIMUM_MEMORY_THRESHOLD = 0.2f;
 
-  public static final Pattern CP_HTD_ATTR_KEY_PATTERN = Pattern.compile
-      ("^coprocessor\\$([0-9]+)$", Pattern.CASE_INSENSITIVE);
+  public static final Pattern CP_HTD_ATTR_KEY_PATTERN =
+      Pattern.compile("^coprocessor\\$([0-9]+)$", Pattern.CASE_INSENSITIVE);
   public static final Pattern CP_HTD_ATTR_VALUE_PATTERN =
       Pattern.compile("(^[^\\|]*)\\|([^\\|]+)\\|[\\s]*([\\d]*)[\\s]*(\\|.*)?$");
 
@@ -912,7 +913,7 @@ public final class HConstants {
    * 1   => Abort only all of the handers have died
    */
   public static final String REGION_SERVER_HANDLER_ABORT_ON_ERROR_PERCENT =
-		  "hbase.regionserver.handler.abort.on.error.percent";
+      "hbase.regionserver.handler.abort.on.error.percent";
   public static final double DEFAULT_REGION_SERVER_HANDLER_ABORT_ON_ERROR_PERCENT = 0.5;
 
   //High priority handlers to deal with admin requests and system table operation requests
@@ -972,7 +973,8 @@ public final class HConstants {
   public static final String DEFAULT_WAL_STORAGE_POLICY = "NONE";
 
   /** Region in Transition metrics threshold time */
-  public static final String METRICS_RIT_STUCK_WARNING_THRESHOLD="hbase.metrics.rit.stuck.warning.threshold";
+  public static final String METRICS_RIT_STUCK_WARNING_THRESHOLD =
+      "hbase.metrics.rit.stuck.warning.threshold";
 
   public static final String LOAD_BALANCER_SLOP_KEY = "hbase.regions.slop";
 
@@ -1068,7 +1070,8 @@ public final class HConstants {
    * 0.0.0.0.
    * @see <a href="https://issues.apache.org/jira/browse/HBASE-9961">HBASE-9961</a>
    */
-  public static final String STATUS_MULTICAST_BIND_ADDRESS = "hbase.status.multicast.bind.address.ip";
+  public static final String STATUS_MULTICAST_BIND_ADDRESS =
+      "hbase.status.multicast.bind.address.ip";
   public static final String DEFAULT_STATUS_MULTICAST_BIND_ADDRESS = "0.0.0.0";
 
   /**
@@ -1197,6 +1200,20 @@ public final class HConstants {
   public static final String REGION_SPLIT_THREADS_MAX =
     "hbase.regionserver.region.split.threads.max";
 
+  /** Canary config keys */
+  public static final String HBASE_CANARY_WRITE_DATA_TTL_KEY = "hbase.canary.write.data.ttl";
+
+  public static final String HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY =
+      "hbase.canary.write.perserver.regions.lowerLimit";
+
+  public static final String HBASE_CANARY_WRITE_PERSERVER_REGIONS_UPPERLIMIT_KEY =
+      "hbase.canary.write.perserver.regions.upperLimit";
+
+  public static final String HBASE_CANARY_WRITE_VALUE_SIZE_KEY = "hbase.canary.write.value.size";
+
+  public static final String HBASE_CANARY_WRITE_TABLE_CHECK_PERIOD_KEY =
+      "hbase.canary.write.table.check.period";
+
   private HConstants() {
     // Can't be instantiated with this ctor.
   }

http://git-wip-us.apache.org/repos/asf/hbase/blob/234a4632/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon
index caa3535..c0b70a2 100644
--- a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon
+++ b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon
@@ -48,6 +48,7 @@ org.apache.hadoop.hbase.master.RegionState;
 org.apache.hadoop.hbase.HTableDescriptor;
 org.apache.hadoop.hbase.HBaseConfiguration;
 org.apache.hadoop.hbase.TableName;
+org.apache.hadoop.hbase.tool.Canary;
 org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
 org.apache.hadoop.hbase.master.DeadServer;
 org.apache.hadoop.hbase.protobuf.ProtobufUtil;
@@ -366,11 +367,14 @@ AssignmentManager assignmentManager = master.getAssignmentManager();
     </%if>
     <%java>String description = null;
         if (tableName.equals(TableName.META_TABLE_NAME)){
-            description = "The hbase:meta table holds references to all User Table regions";
+            description = "The hbase:meta table holds references to all User Table regions.";
+        } else if (tableName.equals(Canary.DEFAULT_WRITE_TABLE_NAME)){
+            description = "The hbase:canary table is used to sniff the write availbility of"
+              + " each regionserver.";
         } else if (tableName.equals(AccessControlLists.ACL_TABLE_NAME)){
             description = "The hbase:acl table holds information about acl";
-	 } else if (tableName.equals(VisibilityConstants.LABELS_TABLE_NAME)){
-	     description = "The hbase:labels table holds information about visibility labels";
+        } else if (tableName.equals(VisibilityConstants.LABELS_TABLE_NAME)){
+            description = "The hbase:labels table holds information about visibility labels.";
         } else if (tableName.equals(TableName.NAMESPACE_TABLE_NAME)){
             description = "The hbase:namespace table holds information about namespaces.";
         } else if (tableName.equals(QuotaUtil.QUOTA_TABLE_NAME)){

http://git-wip-us.apache.org/repos/asf/hbase/blob/234a4632/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
index 2cf34b3..7f5c684 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
@@ -24,6 +24,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
@@ -47,9 +48,11 @@ import org.apache.hadoop.hbase.ChoreService;
 import org.apache.hadoop.hbase.DoNotRetryIOException;
 import org.apache.hadoop.hbase.HBaseConfiguration;
 import org.apache.hadoop.hbase.HColumnDescriptor;
+import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.HRegionLocation;
 import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.NamespaceDescriptor;
 import org.apache.hadoop.hbase.ScheduledChore;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.TableName;
@@ -59,13 +62,17 @@ import org.apache.hadoop.hbase.client.Admin;
 import org.apache.hadoop.hbase.client.Connection;
 import org.apache.hadoop.hbase.client.ConnectionFactory;
 import org.apache.hadoop.hbase.client.Get;
-import org.apache.hadoop.hbase.client.NeedUnmanagedConnectionException;
+import org.apache.hadoop.hbase.client.Put;
 import org.apache.hadoop.hbase.client.RegionLocator;
 import org.apache.hadoop.hbase.client.ResultScanner;
 import org.apache.hadoop.hbase.client.Scan;
 import org.apache.hadoop.hbase.client.Table;
 import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
+import org.apache.hadoop.hbase.tool.Canary.RegionTask.TaskType;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
 import org.apache.hadoop.hbase.util.ReflectionUtils;
+import org.apache.hadoop.hbase.util.RegionSplitter;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 
@@ -86,6 +93,9 @@ public final class Canary implements Tool {
     public void publishReadFailure(HRegionInfo region, Exception e);
     public void publishReadFailure(HRegionInfo region, HColumnDescriptor column, Exception e);
     public void publishReadTiming(HRegionInfo region, HColumnDescriptor column, long msTime);
+    public void publishWriteFailure(HRegionInfo region, Exception e);
+    public void publishWriteFailure(HRegionInfo region, HColumnDescriptor column, Exception e);
+    public void publishWriteTiming(HRegionInfo region, HColumnDescriptor column, long msTime);
   }
   // new extended sink for output regionserver mode info
   // do not change the Sink interface directly due to maintaining the API
@@ -113,6 +123,23 @@ public final class Canary implements Tool {
       LOG.info(String.format("read from region %s column family %s in %dms",
                region.getRegionNameAsString(), column.getNameAsString(), msTime));
     }
+
+    @Override
+    public void publishWriteFailure(HRegionInfo region, Exception e) {
+      LOG.error(String.format("write to region %s failed", region.getRegionNameAsString()), e);
+    }
+
+    @Override
+    public void publishWriteFailure(HRegionInfo region, HColumnDescriptor column, Exception e) {
+      LOG.error(String.format("write to region %s column family %s failed",
+        region.getRegionNameAsString(), column.getNameAsString()), e);
+    }
+
+    @Override
+    public void publishWriteTiming(HRegionInfo region, HColumnDescriptor column, long msTime) {
+      LOG.info(String.format("write to region %s column family %s in %dms",
+        region.getRegionNameAsString(), column.getNameAsString(), msTime));
+    }
   }
   // a ExtendedSink implementation
   public static class RegionServerStdOutSink extends StdOutSink implements ExtendedSink {
@@ -134,18 +161,34 @@ public final class Canary implements Tool {
    * failure.
    */
   static class RegionTask implements Callable<Void> {
+    public enum TaskType{
+      READ, WRITE
+    }
     private Connection connection;
     private HRegionInfo region;
     private Sink sink;
+    private TaskType taskType;
 
-    RegionTask(Connection connection, HRegionInfo region, Sink sink) {
+    RegionTask(Connection connection, HRegionInfo region, Sink sink, TaskType taskType) {
       this.connection = connection;
       this.region = region;
       this.sink = sink;
+      this.taskType = taskType;
     }
 
     @Override
     public Void call() {
+      switch (taskType) {
+      case READ:
+        return read();
+      case WRITE:
+        return write();
+      default:
+        return read();
+      }
+    }
+
+    public Void read() {
       Table table = null;
       HTableDescriptor tableDesc = null;
       try {
@@ -158,6 +201,7 @@ public final class Canary implements Tool {
           try {
             table.close();
           } catch (IOException ioe) {
+            LOG.error("Close table failed", e);
           }
         }
         return null;
@@ -212,6 +256,44 @@ public final class Canary implements Tool {
       try {
         table.close();
       } catch (IOException e) {
+        LOG.error("Close table failed", e);
+      }
+      return null;
+    }
+
+    /**
+     * Check writes for the canary table
+     * @return
+     */
+    private Void write() {
+      Table table = null;
+      HTableDescriptor tableDesc = null;
+      try {
+        table = connection.getTable(region.getTable());
+        tableDesc = table.getTableDescriptor();
+        byte[] rowToCheck = region.getStartKey();
+        if (rowToCheck.length == 0) {
+          rowToCheck = new byte[]{0x0};
+        }
+        int writeValueSize =
+            connection.getConfiguration().getInt(HConstants.HBASE_CANARY_WRITE_VALUE_SIZE_KEY, 10);
+        for (HColumnDescriptor column : tableDesc.getColumnFamilies()) {
+          Put put = new Put(rowToCheck);
+          byte[] value = new byte[writeValueSize];
+          Bytes.random(value);
+          put.addColumn(column.getName(), HConstants.EMPTY_BYTE_ARRAY, value);
+          try {
+            long startTime = System.currentTimeMillis();
+            table.put(put);
+            long time = System.currentTimeMillis() - startTime;
+            sink.publishWriteTiming(region, column, time);
+          } catch (Exception e) {
+            sink.publishWriteFailure(region, column, e);
+          }
+        }
+        table.close();
+      } catch (IOException e) {
+        sink.publishWriteFailure(region, e);
       }
       return null;
     }
@@ -269,11 +351,12 @@ public final class Canary implements Tool {
         }
         sink.publishReadTiming(tableName.getNameAsString(), serverName, stopWatch.getTime());
       } catch (TableNotFoundException tnfe) {
+        LOG.error("Table may be deleted", tnfe);
         // This is ignored because it doesn't imply that the regionserver is dead
       } catch (TableNotEnabledException tnee) {
         // This is considered a success since we got a response.
         LOG.debug("The targeted table was disabled.  Assuming success.");
-      } catch (DoNotRetryIOException | NeedUnmanagedConnectionException dnrioe) {
+      } catch (DoNotRetryIOException dnrioe) {
         sink.publishReadFailure(tableName.getNameAsString(), serverName);
         LOG.error(dnrioe);
       } catch (IOException e) {
@@ -284,6 +367,7 @@ public final class Canary implements Tool {
           try {
             table.close();
           } catch (IOException e) {/* DO NOTHING */
+            LOG.error("Close table failed", e);
           }
         }
         scan = null;
@@ -302,11 +386,15 @@ public final class Canary implements Tool {
   private static final long DEFAULT_INTERVAL = 6000;
 
   private static final long DEFAULT_TIMEOUT = 600000; // 10 mins
-
   private static final int MAX_THREADS_NUM = 16; // #threads to contact regions
 
   private static final Log LOG = LogFactory.getLog(Canary.class);
 
+  public static final TableName DEFAULT_WRITE_TABLE_NAME = TableName.valueOf(
+    NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR, "canary");
+
+  private static final String CANARY_TABLE_FAMILY_NAME = "Test";
+
   private Configuration conf = null;
   private long interval = 0;
   private Sink sink = null;
@@ -315,6 +403,9 @@ public final class Canary implements Tool {
   private long timeout = DEFAULT_TIMEOUT;
   private boolean failOnError = true;
   private boolean regionServerMode = false;
+  private boolean writeSniffing = false;
+  private TableName writeTableName = DEFAULT_WRITE_TABLE_NAME;
+
   private ExecutorService executor; // threads to retrieve data from regionservers
 
   public Canary() {
@@ -336,11 +427,8 @@ public final class Canary implements Tool {
     this.conf = conf;
   }
 
-  @Override
-  public int run(String[] args) throws Exception {
+  private int parseArgs(String[] args) {
     int index = -1;
-    ChoreService choreService = null;
-
     // Process command line args
     for (int i = 0; i < args.length; i++) {
       String cmd = args[i];
@@ -375,6 +463,8 @@ public final class Canary implements Tool {
           }
         } else if(cmd.equals("-regionserver")) {
           this.regionServerMode = true;
+        } else if(cmd.equals("-writeSniffing")) {
+          this.writeSniffing = true;
         } else if (cmd.equals("-e")) {
           this.useRegExp = true;
         } else if (cmd.equals("-t")) {
@@ -391,7 +481,14 @@ public final class Canary implements Tool {
             System.err.println("-t needs a numeric value argument.");
             printUsageAndExit();
           }
+        } else if (cmd.equals("-writeTable")) {
+          i++;
 
+          if (i == args.length) {
+            System.err.println("-writeTable needs a string value argument.");
+            printUsageAndExit();
+          }
+          this.writeTableName = TableName.valueOf(args[i]);
         } else if (cmd.equals("-f")) {
           i++;
 
@@ -412,6 +509,13 @@ public final class Canary implements Tool {
         index = i;
       }
     }
+    return index;
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+    int index = parseArgs(args);
+    ChoreService choreService = null;
 
     // Launches chore for refreshing kerberos credentials if security is enabled.
     // Please see http://hbase.apache.org/book.html#_running_canary_in_a_kerberos_enabled_cluster
@@ -497,6 +601,9 @@ public final class Canary implements Tool {
     System.err.println("   -f <B>         stop whole program if first error occurs," +
         " default is true");
     System.err.println("   -t <N>         timeout for a check, default is 600000 (milisecs)");
+    System.err.println("   -writeSniffing enable the write sniffing in canary");
+    System.err.println("   -writeTable    The table used for write sniffing."
+        + " Default is hbase:canary");
     System.exit(USAGE_EXIT_CODE);
   }
 
@@ -523,7 +630,8 @@ public final class Canary implements Tool {
               (ExtendedSink) this.sink, this.executor);
     } else {
       monitor =
-          new RegionMonitor(connection, monitorTargets, this.useRegExp, this.sink, this.executor);
+          new RegionMonitor(connection, monitorTargets, this.useRegExp, this.sink, this.executor,
+              this.writeSniffing, this.writeTableName);
     }
     return monitor;
   }
@@ -586,10 +694,34 @@ public final class Canary implements Tool {
 
   // a monitor for region mode
   private static class RegionMonitor extends Monitor {
+    // 10 minutes
+    private static final int DEFAULT_WRITE_TABLE_CHECK_PERIOD = 10 * 60 * 1000;
+    // 1 days
+    private static final int DEFAULT_WRITE_DATA_TTL = 24 * 60 * 60;
+
+    private long lastCheckTime = -1;
+    private boolean writeSniffing;
+    private TableName writeTableName;
+    private int writeDataTTL;
+    private float regionsLowerLimit;
+    private float regionsUpperLimit;
+    private int checkPeriod;
 
     public RegionMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
-        Sink sink, ExecutorService executor) {
+        Sink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName) {
       super(connection, monitorTargets, useRegExp, sink, executor);
+      Configuration conf = connection.getConfiguration();
+      this.writeSniffing = writeSniffing;
+      this.writeTableName = writeTableName;
+      this.writeDataTTL =
+          conf.getInt(HConstants.HBASE_CANARY_WRITE_DATA_TTL_KEY, DEFAULT_WRITE_DATA_TTL);
+      this.regionsLowerLimit =
+          conf.getFloat(HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY, 1.0f);
+      this.regionsUpperLimit =
+          conf.getFloat(HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_UPPERLIMIT_KEY, 1.5f);
+      this.checkPeriod =
+          conf.getInt(HConstants.HBASE_CANARY_WRITE_TABLE_CHECK_PERIOD_KEY,
+            DEFAULT_WRITE_TABLE_CHECK_PERIOD);
     }
 
     @Override
@@ -601,11 +733,26 @@ public final class Canary implements Tool {
             String[] tables = generateMonitorTables(this.targets);
             this.initialized = true;
             for (String table : tables) {
-              taskFutures.addAll(Canary.sniff(admin, sink, table, executor));
+              taskFutures.addAll(Canary.sniff(admin, sink, table, executor, TaskType.READ));
             }
           } else {
-            taskFutures.addAll(sniff());
+            taskFutures.addAll(sniff(TaskType.READ));
+          }
+
+          if (writeSniffing) {
+            if (EnvironmentEdgeManager.currentTime() - lastCheckTime > checkPeriod) {
+              try {
+                checkWriteTableDistribution();
+              } catch (IOException e) {
+                LOG.error("Check canary table distribution failed!", e);
+              }
+              lastCheckTime = EnvironmentEdgeManager.currentTime();
+            }
+            // sniff canary table with write operation
+            taskFutures.addAll(Canary.sniff(admin, sink,
+              admin.getTableDescriptor(writeTableName), executor, TaskType.WRITE));
           }
+
           for (Future<Void> future : taskFutures) {
             try {
               future.get();
@@ -661,25 +808,91 @@ public final class Canary implements Tool {
     /*
      * canary entry point to monitor all the tables.
      */
-    private List<Future<Void>> sniff() throws Exception {
+    private List<Future<Void>> sniff(TaskType taskType) throws Exception {
       List<Future<Void>> taskFutures = new LinkedList<Future<Void>>();
       for (HTableDescriptor table : admin.listTables()) {
-        if (admin.isTableEnabled(table.getTableName())) {
-          taskFutures.addAll(Canary.sniff(admin, sink, table, executor));
+        if (admin.isTableEnabled(table.getTableName())
+            && (!table.getTableName().equals(writeTableName))) {
+          taskFutures.addAll(Canary.sniff(admin, sink, table, executor, taskType));
         }
       }
       return taskFutures;
     }
+
+    private void checkWriteTableDistribution() throws IOException {
+      if (!admin.tableExists(writeTableName)) {
+        int numberOfServers = admin.getClusterStatus().getServers().size();
+        if (numberOfServers == 0) {
+          throw new IllegalStateException("No live regionservers");
+        }
+        createWriteTable(numberOfServers);
+      }
+
+      if (!admin.isTableEnabled(writeTableName)) {
+        admin.enableTable(writeTableName);
+      }
+
+      int numberOfServers = admin.getClusterStatus().getServers().size();
+      List<HRegionLocation> locations;
+      RegionLocator locator = connection.getRegionLocator(writeTableName);
+      try {
+        locations = locator.getAllRegionLocations();
+      } finally {
+        locator.close();
+      }
+      int numberOfRegions = locations.size();
+      if (numberOfRegions < numberOfServers * regionsLowerLimit
+          || numberOfRegions > numberOfServers * regionsUpperLimit) {
+        admin.disableTable(writeTableName);
+        admin.deleteTable(writeTableName);
+        createWriteTable(numberOfServers);
+      }
+      HashSet<ServerName> serverSet = new HashSet<ServerName>();
+      for (HRegionLocation location: locations) {
+        serverSet.add(location.getServerName());
+      }
+      int numberOfCoveredServers = serverSet.size();
+      if (numberOfCoveredServers < numberOfServers) {
+        admin.balancer();
+      }
+    }
+
+    private void createWriteTable(int numberOfServers) throws IOException {
+      int numberOfRegions = (int)(numberOfServers * regionsLowerLimit);
+      LOG.info("Number of live regionservers: " + numberOfServers + ", "
+          + "pre-splitting the canary table into " + numberOfRegions + " regions "
+          + "(current  lower limi of regions per server is " + regionsLowerLimit
+          + " and you can change it by config: "
+          + HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY + " )");
+      HTableDescriptor desc = new HTableDescriptor(writeTableName);
+      HColumnDescriptor family = new HColumnDescriptor(CANARY_TABLE_FAMILY_NAME);
+      family.setMaxVersions(1);
+      family.setTimeToLive(writeDataTTL);
+
+      desc.addFamily(family);
+      byte[][] splits = new RegionSplitter.HexStringSplit().split(numberOfRegions);
+      admin.createTable(desc, splits);
+    }
   }
 
   /**
    * Canary entry point for specified table.
    * @throws Exception
    */
-  public static void sniff(final Admin admin, TableName tableName) throws Exception {
+  public static void sniff(final Admin admin, TableName tableName)
+      throws Exception {
+    sniff(admin, tableName, TaskType.READ);
+  }
+
+  /**
+   * Canary entry point for specified table with task type(read/write)
+   * @throws Exception
+   */
+  public static void sniff(final Admin admin, TableName tableName, TaskType taskType)
+      throws Exception {
     List<Future<Void>> taskFutures =
         Canary.sniff(admin, new StdOutSink(), tableName.getNameAsString(),
-          new ScheduledThreadPoolExecutor(1));
+          new ScheduledThreadPoolExecutor(1), taskType);
     for (Future<Void> future : taskFutures) {
       future.get();
     }
@@ -690,10 +903,10 @@ public final class Canary implements Tool {
    * @throws Exception
    */
   private static List<Future<Void>> sniff(final Admin admin, final Sink sink, String tableName,
-      ExecutorService executor) throws Exception {
+      ExecutorService executor, TaskType taskType) throws Exception {
     if (admin.isTableEnabled(TableName.valueOf(tableName))) {
       return Canary.sniff(admin, sink, admin.getTableDescriptor(TableName.valueOf(tableName)),
-        executor);
+        executor, taskType);
     } else {
       LOG.warn(String.format("Table %s is not enabled", tableName));
     }
@@ -704,7 +917,7 @@ public final class Canary implements Tool {
    * Loops over regions that owns this table, and output some information abouts the state.
    */
   private static List<Future<Void>> sniff(final Admin admin, final Sink sink,
-      HTableDescriptor tableDesc, ExecutorService executor) throws Exception {
+      HTableDescriptor tableDesc, ExecutorService executor, TaskType taskType) throws Exception {
     Table table = null;
     try {
       table = admin.getConnection().getTable(tableDesc.getTableName());
@@ -714,7 +927,7 @@ public final class Canary implements Tool {
     List<RegionTask> tasks = new ArrayList<RegionTask>();
     try {
       for (HRegionInfo region : admin.getTableRegions(tableDesc.getTableName())) {
-        tasks.add(new RegionTask(admin.getConnection(), region, sink));
+        tasks.add(new RegionTask(admin.getConnection(), region, sink, taskType));
       }
     } finally {
       table.close();

http://git-wip-us.apache.org/repos/asf/hbase/blob/234a4632/src/main/asciidoc/_chapters/ops_mgt.adoc
----------------------------------------------------------------------
diff --git a/src/main/asciidoc/_chapters/ops_mgt.adoc b/src/main/asciidoc/_chapters/ops_mgt.adoc
index b8018b6..a4dbccb 100644
--- a/src/main/asciidoc/_chapters/ops_mgt.adoc
+++ b/src/main/asciidoc/_chapters/ops_mgt.adoc
@@ -92,6 +92,8 @@ Usage: bin/hbase org.apache.hadoop.hbase.tool.Canary [opts] [table1 [table2]...]
       which means the region/regionserver is regular expression pattern
    -f <B>         stop whole program if first error occurs, default is true
    -t <N>         timeout for a check, default is 600000 (milliseconds)
+   -writeSniffing enable the write sniffing in canary
+   -writeTable    The table used for write sniffing. Default is hbase:canary
 ----
 
 This tool will return non zero error codes to user for collaborating with other monitoring tools, such as Nagios.
@@ -193,6 +195,25 @@ This run sets the timeout value to 60 seconds, the default value is 600 seconds.
 $ ${HBASE_HOME}/bin/hbase org.apache.hadoop.hbase.tool.Canary -t 600000
 ----
 
+==== Enable write sniffing in canary
+
+By default, the canary tool only check the read operations, it's hard to find the problem in the
+write path. To enable the write sniffing, you can run canary with the `-writeSniffing` option.
+When the write sniffing is enabled, the canary tool will create a hbase table and make sure the
+regions of the table distributed on all region servers. In each sniffing period, the canary will
+try to put data to these regions to check the write availability of each region server.
+----
+$ ${HBASE_HOME}/bin/hbase org.apache.hadoop.hbase.tool.Canary -writeSniffing
+----
+
+The default write table is `hbase:canary` and can be specified by the option `-writeTable`.
+----
+$ ${HBASE_HOME}/bin/hbase org.apache.hadoop.hbase.tool.Canary -writeSniffing -writeTable ns:canary
+----
+
+The default value size of each put is 10 bytes and you can set it by the config key:
+`hbase.canary.write.value.size`.
+
 ==== Running Canary in a Kerberos-enabled Cluster
 
 To run Canary in a Kerberos-enabled cluster, configure the following two properties in _hbase-site.xml_: