You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by bu...@apache.org on 2019/10/10 21:03:09 UTC
[hbase] branch branch-1.4 updated: HBASE-22874 Define a public API
for Canary checking and a non-public tool implementation
This is an automated email from the ASF dual-hosted git repository.
busbey pushed a commit to branch branch-1.4
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-1.4 by this push:
new 19e0ff8 HBASE-22874 Define a public API for Canary checking and a non-public tool implementation
19e0ff8 is described below
commit 19e0ff890a3cb1e5d79a324aa244cc05e2549c98
Author: Rushabh <ru...@salesforce.com>
AuthorDate: Tue Aug 20 12:40:49 2019 -0700
HBASE-22874 Define a public API for Canary checking and a non-public tool implementation
Closes #580
* Canary is now an IA.Public interface
* CanaryTool is now the implementation
Branch-1 specific changes for differences in APIs and cleanup of the ref guide using a classname.
Co-authored-by: Sean Busbey <bu...@apache.org>
Signed-off-by: Sean Busbey <bu...@apache.org>
---
bin/hbase | 2 +-
.../hbase/tmpl/master/MasterStatusTmpl.jamon | 4 +-
.../java/org/apache/hadoop/hbase/tool/Canary.java | 1731 +-------------------
.../hbase/tool/{Canary.java => CanaryTool.java} | 853 +++++-----
.../apache/hadoop/hbase/tool/TestCanaryTool.java | 58 +-
src/main/asciidoc/_chapters/ops_mgt.adoc | 2 +-
6 files changed, 546 insertions(+), 2104 deletions(-)
diff --git a/bin/hbase b/bin/hbase
index 594726a..ddcb92e 100755
--- a/bin/hbase
+++ b/bin/hbase
@@ -400,7 +400,7 @@ elif [ "$COMMAND" = "ltt" ] ; then
CLASS='org.apache.hadoop.hbase.util.LoadTestTool'
HBASE_OPTS="$HBASE_OPTS $HBASE_LTT_OPTS"
elif [ "$COMMAND" = "canary" ] ; then
- CLASS='org.apache.hadoop.hbase.tool.Canary'
+ CLASS='org.apache.hadoop.hbase.tool.CanaryTool'
HBASE_OPTS="$HBASE_OPTS $HBASE_CANARY_OPTS"
elif [ "$COMMAND" = "hbtop" ] ; then
CLASS='org.apache.hadoop.hbase.hbtop.HBTop'
diff --git a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon
index 776b49d..e8821d0 100644
--- a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon
+++ b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon
@@ -48,7 +48,7 @@ org.apache.hadoop.hbase.master.RegionState;
org.apache.hadoop.hbase.HTableDescriptor;
org.apache.hadoop.hbase.HBaseConfiguration;
org.apache.hadoop.hbase.TableName;
-org.apache.hadoop.hbase.tool.Canary;
+org.apache.hadoop.hbase.tool.CanaryTool;
org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
org.apache.hadoop.hbase.master.DeadServer;
org.apache.hadoop.hbase.protobuf.ProtobufUtil;
@@ -404,7 +404,7 @@ AssignmentManager assignmentManager = master.getAssignmentManager();
<%java>String description = null;
if (tableName.equals(TableName.META_TABLE_NAME)){
description = "The hbase:meta table holds references to all User Table regions.";
- } else if (tableName.equals(Canary.DEFAULT_WRITE_TABLE_NAME)){
+ } else if (tableName.equals(CanaryTool.DEFAULT_WRITE_TABLE_NAME)){
description = "The hbase:canary table is used to sniff the write availbility of"
+ " each regionserver.";
} else if (tableName.equals(AccessControlLists.ACL_TABLE_NAME)){
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
index afec6b6..9a93b51 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
@@ -19,1730 +19,51 @@
package org.apache.hadoop.hbase.tool;
-import static org.apache.hadoop.hbase.HConstants.DEFAULT_ZOOKEEPER_ZNODE_PARENT;
-import static org.apache.hadoop.hbase.HConstants.ZOOKEEPER_ZNODE_PARENT;
-import com.google.common.collect.Lists;
-
-import java.io.Closeable;
-import java.io.IOException;
-import java.net.InetSocketAddress;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.LinkedList;
-import java.util.List;
import java.util.Map;
-import java.util.Random;
-import java.util.Set;
-import java.util.TreeSet;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Future;
-import java.util.concurrent.ScheduledThreadPoolExecutor;
-import java.util.concurrent.atomic.AtomicLong;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.commons.lang.time.StopWatch;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
+import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.hbase.AuthUtil;
-import org.apache.hadoop.hbase.ChoreService;
-import org.apache.hadoop.hbase.ClusterStatus;
-import org.apache.hadoop.hbase.DoNotRetryIOException;
-import org.apache.hadoop.hbase.HBaseConfiguration;
-import org.apache.hadoop.hbase.HColumnDescriptor;
-import org.apache.hadoop.hbase.HConstants;
-import org.apache.hadoop.hbase.HRegionInfo;
-import org.apache.hadoop.hbase.HRegionLocation;
-import org.apache.hadoop.hbase.HTableDescriptor;
-import org.apache.hadoop.hbase.NamespaceDescriptor;
-import org.apache.hadoop.hbase.ScheduledChore;
-import org.apache.hadoop.hbase.ServerName;
-import org.apache.hadoop.hbase.TableName;
-import org.apache.hadoop.hbase.TableNotEnabledException;
-import org.apache.hadoop.hbase.TableNotFoundException;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
-import org.apache.hadoop.hbase.client.Admin;
-import org.apache.hadoop.hbase.client.Connection;
-import org.apache.hadoop.hbase.client.ConnectionFactory;
-import org.apache.hadoop.hbase.client.Get;
-import org.apache.hadoop.hbase.client.Put;
-import org.apache.hadoop.hbase.client.RegionLocator;
-import org.apache.hadoop.hbase.client.ResultScanner;
-import org.apache.hadoop.hbase.client.Scan;
-import org.apache.hadoop.hbase.client.Table;
-import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
-import org.apache.hadoop.hbase.tool.Canary.RegionTask.TaskType;
-import org.apache.hadoop.hbase.util.Bytes;
-import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
-import org.apache.hadoop.hbase.util.ReflectionUtils;
-import org.apache.hadoop.hbase.util.RegionSplitter;
-import org.apache.hadoop.hbase.zookeeper.EmptyWatcher;
-import org.apache.hadoop.hbase.zookeeper.ZKConfig;
-import org.apache.hadoop.util.GenericOptionsParser;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.zookeeper.KeeperException;
-import org.apache.zookeeper.ZooKeeper;
-import org.apache.zookeeper.client.ConnectStringParser;
-import org.apache.zookeeper.data.Stat;
-
-/**
- * HBase Canary Tool, that that can be used to do
- * "canary monitoring" of a running HBase cluster.
- *
- * Here are three modes
- * 1. region mode - Foreach region tries to get one row per column family
- * and outputs some information about failure or latency.
- *
- * 2. regionserver mode - Foreach regionserver tries to get one row from one table
- * selected randomly and outputs some information about failure or latency.
- *
- * 3. zookeeper mode - for each zookeeper instance, selects a zNode and
- * outputs some information about failure or latency.
- */
-@InterfaceAudience.Private
-public final class Canary implements Tool {
- // Sink interface used by the canary to outputs information
- public interface Sink {
- public long getReadFailureCount();
- public long incReadFailureCount();
- public Map<String,String> getReadFailures();
- public void updateReadFailures(String regionName, String serverName);
- public long getWriteFailureCount();
- public long incWriteFailureCount();
- public Map<String,String> getWriteFailures();
- public void updateWriteFailures(String regionName, String serverName);
- public long getReadSuccessCount();
- public long incReadSuccessCount();
- public long getWriteSuccessCount();
- public long incWriteSuccessCount();
- }
-
- // Simple implementation of canary sink that allows to plot on
- // file or standard output timings or failures.
- public static class StdOutSink implements Sink {
- private AtomicLong readFailureCount = new AtomicLong(0),
- writeFailureCount = new AtomicLong(0),
- readSuccessCount = new AtomicLong(0),
- writeSuccessCount = new AtomicLong(0);
-
- private Map<String, String> readFailures = new ConcurrentHashMap<String, String>();
- private Map<String, String> writeFailures = new ConcurrentHashMap<String, String>();
-
- @Override
- public long getReadFailureCount() {
- return readFailureCount.get();
- }
-
- @Override
- public long incReadFailureCount() {
- return readFailureCount.incrementAndGet();
- }
-
- @Override
- public Map<String, String> getReadFailures() {
- return readFailures;
- }
-
- @Override
- public void updateReadFailures(String regionName, String serverName) {
- readFailures.put(regionName, serverName);
- }
-
- @Override
- public long getWriteFailureCount() {
- return writeFailureCount.get();
- }
-
- @Override
- public long incWriteFailureCount() {
- return writeFailureCount.incrementAndGet();
- }
-
- @Override
- public Map<String, String> getWriteFailures() {
- return writeFailures;
- }
-
- @Override
- public void updateWriteFailures(String regionName, String serverName) {
- writeFailures.put(regionName, serverName);
- }
-
- @Override
- public long getReadSuccessCount() {
- return readSuccessCount.get();
- }
-
- @Override
- public long incReadSuccessCount() {
- return readSuccessCount.incrementAndGet();
- }
-
- @Override
- public long getWriteSuccessCount() {
- return writeSuccessCount.get();
- }
-
- @Override
- public long incWriteSuccessCount() {
- return writeSuccessCount.incrementAndGet();
- }
- }
- public static class RegionServerStdOutSink extends StdOutSink {
+@InterfaceAudience.Public
+public interface Canary {
- public void publishReadFailure(String table, String server) {
- incReadFailureCount();
- LOG.error(String.format("Read from table:%s on region server:%s", table, server));
+ public static class Factory {
+ static Canary create(Configuration conf, ExecutorService executor) {
+ return new CanaryTool(conf, executor);
}
- public void publishReadTiming(String table, String server, long msTime) {
- LOG.info(String.format("Read from table:%s on region server:%s in %dms",
- table, server, msTime));
- }
- }
-
- public static class ZookeeperStdOutSink extends StdOutSink {
-
- public void publishReadFailure(String zNode, String server) {
- incReadFailureCount();
- LOG.error(String.format("Read from zNode:%s on zookeeper instance:%s", zNode, server));
- }
-
- public void publishReadTiming(String znode, String server, long msTime) {
- LOG.info(String.format("Read from zNode:%s on zookeeper instance:%s in %dms",
- znode, server, msTime));
- }
- }
-
- public static class RegionStdOutSink extends StdOutSink {
-
- private Map<String, AtomicLong> perTableReadLatency = new HashMap<>();
- private AtomicLong writeLatency = new AtomicLong();
- private Map<String, RegionTaskResult> regionMap = new ConcurrentHashMap<>();
-
- public void publishReadFailure(ServerName serverName, HRegionInfo region, Exception e) {
- incReadFailureCount();
- LOG.error(String.format("read from region %s on regionserver %s failed", region.getRegionNameAsString(), serverName), e);
- }
-
- public void publishReadFailure(ServerName serverName, HRegionInfo region, HColumnDescriptor column, Exception e) {
- incReadFailureCount();
- LOG.error(String.format("read from region %s on regionserver %s column family %s failed",
- region.getRegionNameAsString(), serverName, column.getNameAsString()), e);
- }
-
- public void publishReadTiming(ServerName serverName, HRegionInfo region, HColumnDescriptor column, long msTime) {
- incReadSuccessCount();
- RegionTaskResult res = this.regionMap.get(region.getRegionNameAsString());
- res.setReadSuccess();
- res.setReadLatency(msTime);
- LOG.info(String.format("read from region %s on regionserver %s column family %s in %dms",
- region.getRegionNameAsString(), serverName, column.getNameAsString(), msTime));
- }
-
- public void publishWriteFailure(ServerName serverName, HRegionInfo region, Exception e) {
- incWriteFailureCount();
- LOG.error(String.format("write to region %s on regionserver %s failed", region.getRegionNameAsString(), serverName), e);
- }
-
- public void publishWriteFailure(ServerName serverName, HRegionInfo region, HColumnDescriptor column, Exception e) {
- incWriteFailureCount();
- LOG.error(String.format("write to region %s on regionserver %s column family %s failed",
- region.getRegionNameAsString(), serverName, column.getNameAsString()), e);
- }
-
- public void publishWriteTiming(ServerName serverName, HRegionInfo region, HColumnDescriptor column, long msTime) {
- incWriteSuccessCount();
- RegionTaskResult res = this.regionMap.get(region.getRegionNameAsString());
- res.setWriteSuccess();
- res.setWriteLatency(msTime);
- LOG.info(String.format("write to region %s on regionserver %s column family %s in %dms",
- region.getRegionNameAsString(), serverName, column.getNameAsString(), msTime));
- }
-
- public Map<String, AtomicLong> getReadLatencyMap() {
- return this.perTableReadLatency;
- }
-
- public AtomicLong initializeAndGetReadLatencyForTable(String tableName) {
- AtomicLong initLatency = new AtomicLong(0L);
- this.perTableReadLatency.put(tableName, initLatency);
- return initLatency;
- }
-
- public void initializeWriteLatency() {
- this.writeLatency.set(0L);
- }
-
- public AtomicLong getWriteLatency() {
- return this.writeLatency;
- }
-
- public Map<String, RegionTaskResult> getRegionMap() {
- return this.regionMap;
- }
-
- public int getTotalExpectedRegions() {
- return this.regionMap.size();
- }
- }
-
- static class ZookeeperTask implements Callable<Void> {
- private final Connection connection;
- private final String host;
- private String znode;
- private final int timeout;
- private ZookeeperStdOutSink sink;
-
- public ZookeeperTask(Connection connection, String host, String znode, int timeout,
- ZookeeperStdOutSink sink) {
- this.connection = connection;
- this.host = host;
- this.znode = znode;
- this.timeout = timeout;
- this.sink = sink;
- }
-
- @Override public Void call() throws Exception {
- ZooKeeper zooKeeper = null;
- try {
- zooKeeper = new ZooKeeper(host, timeout, EmptyWatcher.instance);
- Stat exists = zooKeeper.exists(znode, false);
- StopWatch stopwatch = new StopWatch();
- stopwatch.start();
- zooKeeper.getData(znode, false, exists);
- stopwatch.stop();
- sink.publishReadTiming(znode, host, stopwatch.getTime());
- } catch (KeeperException | InterruptedException e) {
- sink.publishReadFailure(znode, host);
- } finally {
- if (zooKeeper != null) {
- zooKeeper.close();
- }
- }
- return null;
- }
- }
-
- /**
- * For each column family of the region tries to get one row and outputs the latency, or the
- * failure.
- */
- static class RegionTask implements Callable<Void> {
- public enum TaskType{
- READ, WRITE
- }
- private Connection connection;
- private HRegionInfo region;
- private RegionStdOutSink sink;
- private TaskType taskType;
- private boolean rawScanEnabled;
- private ServerName serverName;
- private AtomicLong readWriteLatency;
-
- RegionTask(Connection connection, HRegionInfo region, ServerName serverName, RegionStdOutSink sink,
- TaskType taskType, boolean rawScanEnabled, AtomicLong rwLatency) {
- this.connection = connection;
- this.region = region;
- this.serverName = serverName;
- this.sink = sink;
- this.taskType = taskType;
- this.rawScanEnabled = rawScanEnabled;
- this.readWriteLatency = rwLatency;
- }
-
- @Override
- public Void call() {
- switch (taskType) {
- case READ:
- return read();
- case WRITE:
- return write();
- default:
- return read();
- }
- }
-
- public Void read() {
- Table table = null;
- HTableDescriptor tableDesc = null;
- try {
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading table descriptor for table %s",
- region.getTable()));
- }
- table = connection.getTable(region.getTable());
- tableDesc = table.getTableDescriptor();
- } catch (IOException e) {
- LOG.debug("sniffRegion failed", e);
- sink.publishReadFailure(serverName, region, e);
- if (table != null) {
- try {
- table.close();
- } catch (IOException ioe) {
- LOG.error("Close table failed", e);
- }
- }
- return null;
- }
-
- byte[] startKey = null;
- Get get = null;
- Scan scan = null;
- ResultScanner rs = null;
- StopWatch stopWatch = new StopWatch();
- for (HColumnDescriptor column : tableDesc.getColumnFamilies()) {
- stopWatch.reset();
- startKey = region.getStartKey();
- // Can't do a get on empty start row so do a Scan of first element if any instead.
- if (startKey.length > 0) {
- get = new Get(startKey);
- get.setCacheBlocks(false);
- get.setFilter(new FirstKeyOnlyFilter());
- get.addFamily(column.getName());
- } else {
- scan = new Scan();
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("rawScan : %s for table: %s", rawScanEnabled,
- tableDesc.getTableName()));
- }
- scan.setRaw(rawScanEnabled);
- scan.setCaching(1);
- scan.setCacheBlocks(false);
- scan.setFilter(new FirstKeyOnlyFilter());
- scan.addFamily(column.getName());
- scan.setMaxResultSize(1L);
- scan.setOneRowLimit();
- }
-
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading from table %s region %s column family %s and key %s",
- tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(),
- Bytes.toStringBinary(startKey)));
- }
- try {
- stopWatch.start();
- if (startKey.length > 0) {
- table.get(get);
- } else {
- rs = table.getScanner(scan);
- rs.next();
- }
- stopWatch.stop();
- this.readWriteLatency.addAndGet(stopWatch.getTime());
- sink.publishReadTiming(serverName, region, column, stopWatch.getTime());
- } catch (Exception e) {
- sink.publishReadFailure(serverName, region, column, e);
- sink.updateReadFailures(region.getRegionNameAsString(), serverName.getHostname());
- } finally {
- if (rs != null) {
- rs.close();
- }
- scan = null;
- get = null;
- }
- }
- try {
- table.close();
- } catch (IOException e) {
- LOG.error("Close table failed", e);
- }
- return null;
- }
-
- /**
- * Check writes for the canary table
- * @return
- */
- private Void write() {
- Table table = null;
- HTableDescriptor tableDesc = null;
- try {
- table = connection.getTable(region.getTable());
- tableDesc = table.getTableDescriptor();
- byte[] rowToCheck = region.getStartKey();
- if (rowToCheck.length == 0) {
- rowToCheck = new byte[]{0x0};
- }
- int writeValueSize =
- connection.getConfiguration().getInt(HConstants.HBASE_CANARY_WRITE_VALUE_SIZE_KEY, 10);
- for (HColumnDescriptor column : tableDesc.getColumnFamilies()) {
- Put put = new Put(rowToCheck);
- byte[] value = new byte[writeValueSize];
- Bytes.random(value);
- put.addColumn(column.getName(), HConstants.EMPTY_BYTE_ARRAY, value);
-
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("writing to table %s region %s column family %s and key %s",
- tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(),
- Bytes.toStringBinary(rowToCheck)));
- }
- try {
- long startTime = System.currentTimeMillis();
- table.put(put);
- long time = System.currentTimeMillis() - startTime;
- this.readWriteLatency.addAndGet(time);
- sink.publishWriteTiming(serverName, region, column, time);
- } catch (Exception e) {
- sink.publishWriteFailure(serverName, region, column, e);
- }
- }
- table.close();
- } catch (IOException e) {
- sink.publishWriteFailure(serverName, region, e);
- sink.updateWriteFailures(region.getRegionNameAsString(), serverName.getHostname() );
- }
- return null;
- }
- }
-
- /**
- * Get one row from a region on the regionserver and outputs the latency, or the failure.
- */
- static class RegionServerTask implements Callable<Void> {
- private Connection connection;
- private String serverName;
- private HRegionInfo region;
- private RegionServerStdOutSink sink;
- private AtomicLong successes;
-
- RegionServerTask(Connection connection, String serverName, HRegionInfo region,
- RegionServerStdOutSink sink, AtomicLong successes) {
- this.connection = connection;
- this.serverName = serverName;
- this.region = region;
- this.sink = sink;
- this.successes = successes;
- }
-
- @Override
- public Void call() {
- TableName tableName = null;
- Table table = null;
- Get get = null;
- byte[] startKey = null;
- Scan scan = null;
- StopWatch stopWatch = new StopWatch();
- // monitor one region on every region server
- stopWatch.reset();
- try {
- tableName = region.getTable();
- table = connection.getTable(tableName);
- startKey = region.getStartKey();
- // Can't do a get on empty start row so do a Scan of first element if any instead.
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading from region server %s table %s region %s and key %s",
- serverName, region.getTable(), region.getRegionNameAsString(),
- Bytes.toStringBinary(startKey)));
- }
- if (startKey.length > 0) {
- get = new Get(startKey);
- get.setCacheBlocks(false);
- get.setFilter(new FirstKeyOnlyFilter());
- stopWatch.start();
- table.get(get);
- stopWatch.stop();
- } else {
- scan = new Scan();
- scan.setCacheBlocks(false);
- scan.setFilter(new FirstKeyOnlyFilter());
- scan.setCaching(1);
- scan.setMaxResultSize(1L);
- scan.setOneRowLimit();
- stopWatch.start();
- ResultScanner s = table.getScanner(scan);
- s.next();
- s.close();
- stopWatch.stop();
- }
- successes.incrementAndGet();
- sink.publishReadTiming(tableName.getNameAsString(), serverName, stopWatch.getTime());
- } catch (TableNotFoundException tnfe) {
- LOG.error("Table may be deleted", tnfe);
- // This is ignored because it doesn't imply that the regionserver is dead
- } catch (TableNotEnabledException tnee) {
- // This is considered a success since we got a response.
- successes.incrementAndGet();
- LOG.debug("The targeted table was disabled. Assuming success.");
- } catch (DoNotRetryIOException dnrioe) {
- sink.publishReadFailure(tableName.getNameAsString(), serverName);
- LOG.error(dnrioe);
- } catch (IOException e) {
- sink.publishReadFailure(tableName.getNameAsString(), serverName);
- LOG.error(e);
- } finally {
- if (table != null) {
- try {
- table.close();
- } catch (IOException e) {/* DO NOTHING */
- LOG.error("Close table failed", e);
- }
- }
- scan = null;
- get = null;
- startKey = null;
- }
- return null;
- }
- }
-
- private static final int USAGE_EXIT_CODE = 1;
- private static final int INIT_ERROR_EXIT_CODE = 2;
- private static final int TIMEOUT_ERROR_EXIT_CODE = 3;
- private static final int ERROR_EXIT_CODE = 4;
- private static final int FAILURE_EXIT_CODE = 5;
-
- private static final long DEFAULT_INTERVAL = 60000;
-
- private static final long DEFAULT_TIMEOUT = 600000; // 10 mins
- private static final int MAX_THREADS_NUM = 16; // #threads to contact regions
-
- private static final Log LOG = LogFactory.getLog(Canary.class);
-
- public static final TableName DEFAULT_WRITE_TABLE_NAME = TableName.valueOf(
- NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR, "canary");
-
- private static final String CANARY_TABLE_FAMILY_NAME = "Test";
-
- private Configuration conf = null;
- private long interval = 0;
- private Sink sink = null;
-
- private boolean useRegExp;
- private long timeout = DEFAULT_TIMEOUT;
- private boolean failOnError = true;
- private boolean regionServerMode = false;
- private boolean zookeeperMode = false;
- private long permittedFailures = 0;
- private boolean regionServerAllRegions = false;
- private boolean writeSniffing = false;
- private long configuredWriteTableTimeout = DEFAULT_TIMEOUT;
- private boolean treatFailureAsError = false;
- private TableName writeTableName = DEFAULT_WRITE_TABLE_NAME;
- private HashMap<String, Long> configuredReadTableTimeouts = new HashMap<>();
-
- private ExecutorService executor; // threads to retrieve data from regionservers
-
- public Canary() {
- this(new ScheduledThreadPoolExecutor(1), new RegionServerStdOutSink());
- }
-
- public Canary(ExecutorService executor, Sink sink) {
- this.executor = executor;
- this.sink = sink;
- }
-
- @Override
- public Configuration getConf() {
- return conf;
- }
-
- @Override
- public void setConf(Configuration conf) {
- this.conf = conf;
- }
-
- private int parseArgs(String[] args) {
- int index = -1;
- // Process command line args
- for (int i = 0; i < args.length; i++) {
- String cmd = args[i];
-
- if (cmd.startsWith("-")) {
- if (index >= 0) {
- // command line args must be in the form: [opts] [table 1 [table 2 ...]]
- System.err.println("Invalid command line options");
- printUsageAndExit();
- }
-
- if (cmd.equals("-help")) {
- // user asked for help, print the help and quit.
- printUsageAndExit();
- } else if (cmd.equals("-daemon") && interval == 0) {
- // user asked for daemon mode, set a default interval between checks
- interval = DEFAULT_INTERVAL;
- } else if (cmd.equals("-interval")) {
- // user has specified an interval for canary breaths (-interval N)
- i++;
-
- if (i == args.length) {
- System.err.println("-interval needs a numeric value argument.");
- printUsageAndExit();
- }
-
- try {
- interval = Long.parseLong(args[i]) * 1000;
- } catch (NumberFormatException e) {
- System.err.println("-interval needs a numeric value argument.");
- printUsageAndExit();
- }
- } else if (cmd.equals("-zookeeper")) {
- this.zookeeperMode = true;
- } else if(cmd.equals("-regionserver")) {
- this.regionServerMode = true;
- } else if(cmd.equals("-allRegions")) {
- this.regionServerAllRegions = true;
- } else if(cmd.equals("-writeSniffing")) {
- this.writeSniffing = true;
- } else if(cmd.equals("-treatFailureAsError")) {
- this.treatFailureAsError = true;
- } else if (cmd.equals("-e")) {
- this.useRegExp = true;
- } else if (cmd.equals("-t")) {
- i++;
-
- if (i == args.length) {
- System.err.println("-t needs a numeric value argument.");
- printUsageAndExit();
- }
-
- try {
- this.timeout = Long.parseLong(args[i]);
- } catch (NumberFormatException e) {
- System.err.println("-t needs a numeric value argument.");
- printUsageAndExit();
- }
- } else if(cmd.equals("-writeTableTimeout")) {
- i++;
-
- if (i == args.length) {
- System.err.println("-writeTableTimeout needs a numeric value argument.");
- printUsageAndExit();
- }
-
- try {
- this.configuredWriteTableTimeout = Long.parseLong(args[i]);
- } catch (NumberFormatException e) {
- System.err.println("-writeTableTimeout needs a numeric value argument.");
- printUsageAndExit();
- }
- } else if (cmd.equals("-writeTable")) {
- i++;
-
- if (i == args.length) {
- System.err.println("-writeTable needs a string value argument.");
- printUsageAndExit();
- }
- this.writeTableName = TableName.valueOf(args[i]);
- } else if (cmd.equals("-f")) {
- i++;
-
- if (i == args.length) {
- System.err
- .println("-f needs a boolean value argument (true|false).");
- printUsageAndExit();
- }
-
- this.failOnError = Boolean.parseBoolean(args[i]);
- } else if (cmd.equals("-readTableTimeouts")) {
- i++;
-
- if (i == args.length) {
- System.err.println("-readTableTimeouts needs a comma-separated list of read timeouts per table (without spaces).");
- printUsageAndExit();
- }
- String [] tableTimeouts = args[i].split(",");
- for (String tT: tableTimeouts) {
- String [] nameTimeout = tT.split("=");
- if (nameTimeout.length < 2) {
- System.err.println("Each -readTableTimeouts argument must be of the form <tableName>=<read timeout>.");
- printUsageAndExit();
- }
- long timeoutVal = 0L;
- try {
- timeoutVal = Long.parseLong(nameTimeout[1]);
- } catch (NumberFormatException e) {
- System.err.println("-readTableTimeouts read timeout for each table must be a numeric value argument.");
- printUsageAndExit();
- }
- this.configuredReadTableTimeouts.put(nameTimeout[0], timeoutVal);
- }
- } else if (cmd.equals("-permittedZookeeperFailures")) {
- i++;
-
- if (i == args.length) {
- System.err.println("-permittedZookeeperFailures needs a numeric value argument.");
- printUsageAndExit();
- }
- try {
- this.permittedFailures = Long.parseLong(args[i]);
- } catch (NumberFormatException e) {
- System.err.println("-permittedZookeeperFailures needs a numeric value argument.");
- printUsageAndExit();
- }
- } else {
- // no options match
- System.err.println(cmd + " options is invalid.");
- printUsageAndExit();
- }
- } else if (index < 0) {
- // keep track of first table name specified by the user
- index = i;
- }
- }
- if (this.regionServerAllRegions && !this.regionServerMode) {
- System.err.println("-allRegions can only be specified in regionserver mode.");
- printUsageAndExit();
- }
- if (this.zookeeperMode) {
- if (this.regionServerMode || this.regionServerAllRegions || this.writeSniffing) {
- System.err.println("-zookeeper is exclusive and cannot be combined with "
- + "other modes.");
- printUsageAndExit();
- }
- }
- if (this.permittedFailures != 0 && !this.zookeeperMode) {
- System.err.println("-permittedZookeeperFailures requires -zookeeper mode.");
- printUsageAndExit();
- }
- if (!this.configuredReadTableTimeouts.isEmpty() && (this.regionServerMode || this.zookeeperMode)) {
- System.err.println("-readTableTimeouts can only be configured in region mode.");
- printUsageAndExit();
- }
- return index;
- }
-
- @Override
- public int run(String[] args) throws Exception {
- int index = parseArgs(args);
- ChoreService choreService = null;
-
- // Launches chore for refreshing kerberos credentials if security is enabled.
- // Please see http://hbase.apache.org/book.html#_running_canary_in_a_kerberos_enabled_cluster
- // for more details.
- final ScheduledChore authChore = AuthUtil.getAuthChore(conf);
- if (authChore != null) {
- choreService = new ChoreService("CANARY_TOOL");
- choreService.scheduleChore(authChore);
- }
-
- // Start to prepare the stuffs
- Monitor monitor = null;
- Thread monitorThread = null;
- long startTime = 0;
- long currentTimeLength = 0;
- // Get a connection to use in below.
- try (Connection connection = ConnectionFactory.createConnection(this.conf)) {
- do {
- // Do monitor !!
- try {
- monitor = this.newMonitor(connection, index, args);
- monitorThread = new Thread(monitor, "CanaryMonitor-" + System.currentTimeMillis());
- startTime = System.currentTimeMillis();
- monitorThread.start();
- while (!monitor.isDone()) {
- // wait for 1 sec
- Thread.sleep(1000);
- // exit if any error occurs
- if (this.failOnError && monitor.hasError()) {
- monitorThread.interrupt();
- if (monitor.initialized) {
- return monitor.errorCode;
- } else {
- return INIT_ERROR_EXIT_CODE;
- }
- }
- currentTimeLength = System.currentTimeMillis() - startTime;
- if (currentTimeLength > this.timeout) {
- LOG.error("The monitor is running too long (" + currentTimeLength
- + ") after timeout limit:" + this.timeout
- + " will be killed itself !!");
- if (monitor.initialized) {
- return TIMEOUT_ERROR_EXIT_CODE;
- } else {
- return INIT_ERROR_EXIT_CODE;
- }
- }
- }
-
- if (this.failOnError && monitor.finalCheckForErrors()) {
- monitorThread.interrupt();
- return monitor.errorCode;
- }
- } finally {
- if (monitor != null) monitor.close();
- }
-
- Thread.sleep(interval);
- } while (interval > 0);
- } // try-with-resources close
-
- if (choreService != null) {
- choreService.shutdown();
+ @VisibleForTesting
+ static Canary create(Configuration conf, ExecutorService executor, CanaryTool.Sink sink) {
+ return new CanaryTool(conf, executor, sink);
}
- return monitor.errorCode;
- }
-
- public Map<String, String> getReadFailures() {
- return sink.getReadFailures();
- }
-
- public Map<String, String> getWriteFailures() {
- return sink.getWriteFailures();
- }
-
- private void printUsageAndExit() {
- System.err.printf(
- "Usage: bin/hbase %s [opts] [table1 [table2]...] | [regionserver1 [regionserver2]..]%n",
- getClass().getName());
- System.err.println(" where [opts] are:");
- System.err.println(" -help Show this help and exit.");
- System.err.println(" -regionserver replace the table argument to regionserver,");
- System.err.println(" which means to enable regionserver mode");
- System.err.println(" -allRegions Tries all regions on a regionserver,");
- System.err.println(" only works in regionserver mode.");
- System.err.println(" -zookeeper Tries to grab zookeeper.znode.parent ");
- System.err.println(" on each zookeeper instance");
- System.err.println(" -permittedZookeeperFailures <N> Ignore first N failures when attempting to ");
- System.err.println(" connect to individual zookeeper nodes in the ensemble");
- System.err.println(" -daemon Continuous check at defined intervals.");
- System.err.println(" -interval <N> Interval between checks (sec)");
- System.err.println(" -e Use table/regionserver as regular expression");
- System.err.println(" which means the table/regionserver is regular expression pattern");
- System.err.println(" -f <B> stop whole program if first error occurs," +
- " default is true");
- System.err.println(" -t <N> timeout for a check, default is 600000 (millisecs)");
- System.err.println(" -writeTableTimeout <N> write timeout for the writeTable, default is 600000 (millisecs)");
- System.err.println(" -readTableTimeouts <tableName>=<read timeout>,<tableName>=<read timeout>, ... "
- + "comma-separated list of read timeouts per table (no spaces), default is 600000 (millisecs)");
- System.err.println(" -writeSniffing enable the write sniffing in canary");
- System.err.println(" -treatFailureAsError treats read / write failure as error");
- System.err.println(" -writeTable The table used for write sniffing."
- + " Default is hbase:canary");
- System.err.println(" -Dhbase.canary.read.raw.enabled=<true/false> Use this flag to enable or disable raw scan during read canary test"
- + " Default is false and raw is not enabled during scan");
- System.err
- .println(" -D<configProperty>=<value> assigning or override the configuration params");
- System.exit(USAGE_EXIT_CODE);
}
/**
- * Canary region mode-specific data structure which stores information about each region
- * to be scanned
+ * Runs Canary in Region mode.
+ *
+ * @param targets -- list of monitor tables.
+ * @return the exit code of the Canary tool.
*/
- public static class RegionTaskResult {
- private HRegionInfo region;
- private TableName tableName;
- private ServerName serverName;
- private AtomicLong readLatency = null;
- private AtomicLong writeLatency = null;
- private boolean readSuccess = false;
- private boolean writeSuccess = false;
-
- public RegionTaskResult(HRegionInfo region, TableName tableName, ServerName serverName) {
- this.region = region;
- this.tableName = tableName;
- this.serverName = serverName;
- }
-
- public HRegionInfo getRegionInfo() {
- return this.region;
- }
-
- public String getRegionNameAsString() {
- return this.region.getRegionNameAsString();
- }
-
- public TableName getTableName() {
- return this.tableName;
- }
-
- public String getTableNameAsString() {
- return this.tableName.getNameAsString();
- }
-
- public ServerName getServerName() {
- return this.serverName;
- }
-
- public String getServerNameAsString() {
- return this.serverName.getServerName();
- }
-
- public long getReadLatency() {
- if (this.readLatency == null) {
- return -1;
- }
- return this.readLatency.get();
- }
-
- public void setReadLatency(long readLatency) {
- if (this.readLatency != null) {
- this.readLatency.set(readLatency);
- } else {
- this.readLatency = new AtomicLong(readLatency);
- }
- }
-
- public long getWriteLatency() {
- if (this.writeLatency == null) {
- return -1;
- }
- return this.writeLatency.get();
- }
-
- public void setWriteLatency(long writeLatency) {
- if (this.writeLatency != null) {
- this.writeLatency.set(writeLatency);
- } else {
- this.writeLatency = new AtomicLong(writeLatency);
- }
- }
-
- public boolean isReadSuccess() {
- return this.readSuccess;
- }
-
- public void setReadSuccess() {
- this.readSuccess = true;
- }
-
- public boolean isWriteSuccess() {
- return this.writeSuccess;
- }
-
- public void setWriteSuccess() {
- this.writeSuccess = true;
- }
- }
+ public int checkRegions(String[] targets) throws Exception;
/**
- * A Factory method for {@link Monitor}.
- * Can be overridden by user.
- * @param index a start index for monitor target
- * @param args args passed from user
- * @return a Monitor instance
+ * Runs Canary in Region server mode.
+ *
+ * @param targets -- list of monitor region servers.
+ * @return the exit code of the Canary tool.
*/
- public Monitor newMonitor(final Connection connection, int index, String[] args) {
- Monitor monitor = null;
- String[] monitorTargets = null;
-
- if(index >= 0) {
- int length = args.length - index;
- monitorTargets = new String[length];
- System.arraycopy(args, index, monitorTargets, 0, length);
- }
-
- if (this.sink instanceof RegionServerStdOutSink || this.regionServerMode) {
- monitor =
- new RegionServerMonitor(connection, monitorTargets, this.useRegExp,
- (StdOutSink) this.sink, this.executor, this.regionServerAllRegions,
- this.treatFailureAsError, this.permittedFailures);
- } else if (this.sink instanceof ZookeeperStdOutSink || this.zookeeperMode) {
- monitor =
- new ZookeeperMonitor(connection, monitorTargets, this.useRegExp,
- (StdOutSink) this.sink, this.executor, this.treatFailureAsError,
- this.permittedFailures);
- } else {
- monitor =
- new RegionMonitor(connection, monitorTargets, this.useRegExp,
- (StdOutSink) this.sink, this.executor, this.writeSniffing,
- this.writeTableName, this.treatFailureAsError, this.configuredReadTableTimeouts,
- this.configuredWriteTableTimeout, this.permittedFailures);
- }
- return monitor;
- }
-
- // a Monitor super-class can be extended by users
- public static abstract class Monitor implements Runnable, Closeable {
-
- protected Connection connection;
- protected Admin admin;
- protected String[] targets;
- protected boolean useRegExp;
- protected boolean treatFailureAsError;
- protected boolean initialized = false;
-
- protected boolean done = false;
- protected int errorCode = 0;
- protected long allowedFailures = 0;
- protected Sink sink;
- protected ExecutorService executor;
-
- public boolean isDone() {
- return done;
- }
-
- public boolean hasError() {
- return errorCode != 0;
- }
-
- public boolean finalCheckForErrors() {
- if (errorCode != 0) {
- return true;
- }
- if (treatFailureAsError &&
- (sink.getReadFailureCount() > allowedFailures || sink.getWriteFailureCount() > allowedFailures)) {
- LOG.error("Too many failures detected, treating failure as error, failing the Canary.");
- errorCode = FAILURE_EXIT_CODE;
- return true;
- }
- return false;
- }
-
- @Override
- public void close() throws IOException {
- if (this.admin != null) this.admin.close();
- }
-
- protected Monitor(Connection connection, String[] monitorTargets, boolean useRegExp, Sink sink,
- ExecutorService executor, boolean treatFailureAsError, long allowedFailures) {
- if (null == connection) throw new IllegalArgumentException("connection shall not be null");
-
- this.connection = connection;
- this.targets = monitorTargets;
- this.useRegExp = useRegExp;
- this.treatFailureAsError = treatFailureAsError;
- this.sink = sink;
- this.executor = executor;
- this.allowedFailures = allowedFailures;
- }
-
- @Override
- public abstract void run();
-
- protected boolean initAdmin() {
- if (null == this.admin) {
- try {
- this.admin = this.connection.getAdmin();
- } catch (Exception e) {
- LOG.error("Initial HBaseAdmin failed...", e);
- this.errorCode = INIT_ERROR_EXIT_CODE;
- }
- } else if (admin.isAborted()) {
- LOG.error("HBaseAdmin aborted");
- this.errorCode = INIT_ERROR_EXIT_CODE;
- }
- return !this.hasError();
- }
- }
-
- // a monitor for region mode
- private static class RegionMonitor extends Monitor {
- // 10 minutes
- private static final int DEFAULT_WRITE_TABLE_CHECK_PERIOD = 10 * 60 * 1000;
- // 1 days
- private static final int DEFAULT_WRITE_DATA_TTL = 24 * 60 * 60;
-
- private long lastCheckTime = -1;
- private boolean writeSniffing;
- private TableName writeTableName;
- private int writeDataTTL;
- private float regionsLowerLimit;
- private float regionsUpperLimit;
- private int checkPeriod;
- private boolean rawScanEnabled;
- private HashMap<String, Long> configuredReadTableTimeouts;
- private long configuredWriteTableTimeout;
-
- public RegionMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
- StdOutSink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName,
- boolean treatFailureAsError, HashMap<String, Long> configuredReadTableTimeouts,
- long configuredWriteTableTimeout, long allowedFailures) {
- super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures);
- Configuration conf = connection.getConfiguration();
- this.writeSniffing = writeSniffing;
- this.writeTableName = writeTableName;
- this.writeDataTTL =
- conf.getInt(HConstants.HBASE_CANARY_WRITE_DATA_TTL_KEY, DEFAULT_WRITE_DATA_TTL);
- this.regionsLowerLimit =
- conf.getFloat(HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY, 1.0f);
- this.regionsUpperLimit =
- conf.getFloat(HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_UPPERLIMIT_KEY, 1.5f);
- this.checkPeriod =
- conf.getInt(HConstants.HBASE_CANARY_WRITE_TABLE_CHECK_PERIOD_KEY,
- DEFAULT_WRITE_TABLE_CHECK_PERIOD);
- this.rawScanEnabled = conf.getBoolean(HConstants.HBASE_CANARY_READ_RAW_SCAN_KEY, false);
- this.configuredReadTableTimeouts = new HashMap<>(configuredReadTableTimeouts);
- this.configuredWriteTableTimeout = configuredWriteTableTimeout;
- }
-
- private RegionStdOutSink getSink() {
- if (!(sink instanceof RegionStdOutSink)) {
- throw new RuntimeException("Can only write to Region sink");
- }
- return ((RegionStdOutSink) sink);
- }
-
- @Override
- public void run() {
- if (this.initAdmin()) {
- try {
- List<Future<Void>> taskFutures = new LinkedList<>();
- RegionStdOutSink regionSink = this.getSink();
- if (this.targets != null && this.targets.length > 0) {
- String[] tables = generateMonitorTables(this.targets);
- // Check to see that each table name passed in the -readTableTimeouts argument is also passed as a monitor target.
- if (! new HashSet<>(Arrays.asList(tables)).containsAll(this.configuredReadTableTimeouts.keySet())) {
- LOG.error("-readTableTimeouts can only specify read timeouts for monitor targets passed via command line.");
- this.errorCode = USAGE_EXIT_CODE;
- return;
- }
- this.initialized = true;
- for (String table : tables) {
- AtomicLong readLatency = regionSink.initializeAndGetReadLatencyForTable(table);
- taskFutures.addAll(Canary.sniff(admin, regionSink, table, executor, TaskType.READ,
- this.rawScanEnabled, readLatency));
- }
- } else {
- taskFutures.addAll(sniff(TaskType.READ, regionSink));
- }
-
- if (writeSniffing) {
- if (EnvironmentEdgeManager.currentTime() - lastCheckTime > checkPeriod) {
- try {
- checkWriteTableDistribution();
- } catch (IOException e) {
- LOG.error("Check canary table distribution failed!", e);
- }
- lastCheckTime = EnvironmentEdgeManager.currentTime();
- }
- // sniff canary table with write operation
- regionSink.initializeWriteLatency();
- AtomicLong writeTableLatency = regionSink.getWriteLatency();
- taskFutures.addAll(Canary.sniff(admin, regionSink, admin.getTableDescriptor(writeTableName),
- executor, TaskType.WRITE, this.rawScanEnabled, writeTableLatency));
- }
-
- for (Future<Void> future : taskFutures) {
- try {
- future.get();
- } catch (ExecutionException e) {
- LOG.error("Sniff region failed!", e);
- }
- }
- Map<String, AtomicLong> actualReadTableLatency = regionSink.getReadLatencyMap();
- for (Map.Entry<String, Long> entry : configuredReadTableTimeouts.entrySet()) {
- String tableName = entry.getKey();
- if (actualReadTableLatency.containsKey(tableName)) {
- Long actual = actualReadTableLatency.get(tableName).longValue();
- Long configured = entry.getValue();
- LOG.info("Read operation for " + tableName + " took " + actual +
- " ms. The configured read timeout was " + configured + " ms.");
- if (actual > configured) {
- LOG.error("Read operation for " + tableName + " exceeded the configured read timeout.");
- }
- } else {
- LOG.error("Read operation for " + tableName + " failed!");
- }
- }
- if (this.writeSniffing) {
- String writeTableStringName = this.writeTableName.getNameAsString();
- long actualWriteLatency = regionSink.getWriteLatency().longValue();
- LOG.info("Write operation for " + writeTableStringName + " took " + actualWriteLatency + " ms. The configured write timeout was " +
- this.configuredWriteTableTimeout + " ms.");
- // Check that the writeTable write operation latency does not exceed the configured timeout.
- if (actualWriteLatency > this.configuredWriteTableTimeout) {
- LOG.error("Write operation for " + writeTableStringName + " exceeded the configured write timeout.");
- }
- }
- } catch (Exception e) {
- LOG.error("Run regionMonitor failed", e);
- this.errorCode = ERROR_EXIT_CODE;
- } finally {
- this.done = true;
- }
- }
- this.done = true;
- }
-
- private String[] generateMonitorTables(String[] monitorTargets) throws IOException {
- String[] returnTables = null;
-
- if (this.useRegExp) {
- Pattern pattern = null;
- HTableDescriptor[] tds = null;
- Set<String> tmpTables = new TreeSet<String>();
- try {
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading list of tables"));
- }
- tds = this.admin.listTables(pattern);
- if (tds == null) {
- tds = new HTableDescriptor[0];
- }
- for (String monitorTarget : monitorTargets) {
- pattern = Pattern.compile(monitorTarget);
- for (HTableDescriptor td : tds) {
- if (pattern.matcher(td.getNameAsString()).matches()) {
- tmpTables.add(td.getNameAsString());
- }
- }
- }
- } catch (IOException e) {
- LOG.error("Communicate with admin failed", e);
- throw e;
- }
-
- if (tmpTables.size() > 0) {
- returnTables = tmpTables.toArray(new String[tmpTables.size()]);
- } else {
- String msg = "No HTable found, tablePattern:" + Arrays.toString(monitorTargets);
- LOG.error(msg);
- this.errorCode = INIT_ERROR_EXIT_CODE;
- throw new TableNotFoundException(msg);
- }
- } else {
- returnTables = monitorTargets;
- }
-
- return returnTables;
- }
-
- /*
- * canary entry point to monitor all the tables.
- */
- private List<Future<Void>> sniff(TaskType taskType, RegionStdOutSink regionSink) throws Exception {
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading list of tables"));
- }
- List<Future<Void>> taskFutures = new LinkedList<>();
- for (HTableDescriptor table : admin.listTables()) {
- if (admin.tableExists(table.getTableName()) && admin.isTableEnabled(table.getTableName())
- && (!table.getTableName().equals(writeTableName))) {
- AtomicLong readLatency = regionSink.initializeAndGetReadLatencyForTable(table.getNameAsString());
- taskFutures.addAll(Canary.sniff(admin, sink, table, executor, taskType, this.rawScanEnabled, readLatency));
- }
- }
- return taskFutures;
- }
-
- private void checkWriteTableDistribution() throws IOException {
- if (!admin.tableExists(writeTableName)) {
- int numberOfServers = admin.getClusterStatus().getServers().size();
- if (numberOfServers == 0) {
- throw new IllegalStateException("No live regionservers");
- }
- createWriteTable(numberOfServers);
- }
-
- if (!admin.isTableEnabled(writeTableName)) {
- admin.enableTable(writeTableName);
- }
-
- ClusterStatus status = admin.getClusterStatus();
- int numberOfServers = status.getServersSize();
- if (status.getServers().contains(status.getMaster())) {
- numberOfServers -= 1;
- }
-
- List<HRegionLocation> locations;
- RegionLocator locator = connection.getRegionLocator(writeTableName);
- try {
- locations = locator.getAllRegionLocations();
- } finally {
- locator.close();
- }
- int numberOfRegions = locations.size();
- if (numberOfRegions < numberOfServers * regionsLowerLimit
- || numberOfRegions > numberOfServers * regionsUpperLimit) {
- admin.disableTable(writeTableName);
- admin.deleteTable(writeTableName);
- createWriteTable(numberOfServers);
- }
- HashSet<ServerName> serverSet = new HashSet<ServerName>();
- for (HRegionLocation location: locations) {
- serverSet.add(location.getServerName());
- }
- int numberOfCoveredServers = serverSet.size();
- if (numberOfCoveredServers < numberOfServers) {
- admin.balancer();
- }
- }
-
- private void createWriteTable(int numberOfServers) throws IOException {
- int numberOfRegions = (int)(numberOfServers * regionsLowerLimit);
- LOG.info("Number of live regionservers: " + numberOfServers + ", "
- + "pre-splitting the canary table into " + numberOfRegions + " regions "
- + "(current lower limit of regions per server is " + regionsLowerLimit
- + " and you can change it by config: "
- + HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY + " )");
- HTableDescriptor desc = new HTableDescriptor(writeTableName);
- HColumnDescriptor family = new HColumnDescriptor(CANARY_TABLE_FAMILY_NAME);
- family.setMaxVersions(1);
- family.setTimeToLive(writeDataTTL);
-
- desc.addFamily(family);
- byte[][] splits = new RegionSplitter.HexStringSplit().split(numberOfRegions);
- admin.createTable(desc, splits);
- }
- }
+ public int checkRegionServers(String[] targets) throws Exception;
/**
- * Canary entry point for specified table.
- * @throws Exception
- */
- private static List<Future<Void>> sniff(final Admin admin, final Sink sink, String tableName,
- ExecutorService executor, TaskType taskType, boolean rawScanEnabled, AtomicLong readLatency) throws Exception {
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("checking table is enabled and getting table descriptor for table %s",
- tableName));
- }
- if (admin.isTableEnabled(TableName.valueOf(tableName))) {
- return Canary.sniff(admin, sink, admin.getTableDescriptor(TableName.valueOf(tableName)),
- executor, taskType, rawScanEnabled, readLatency);
- } else {
- LOG.warn(String.format("Table %s is not enabled", tableName));
- }
- return new LinkedList<Future<Void>>();
- }
-
- /*
- * Loops over regions that owns this table, and output some information about the state.
+ * Runs Canary in Zookeeper mode.
+ *
+ * @return the exit code of the Canary tool.
*/
- private static List<Future<Void>> sniff(final Admin admin, final Sink sink,
- HTableDescriptor tableDesc, ExecutorService executor, TaskType taskType,
- boolean rawScanEnabled, AtomicLong rwLatency) throws Exception {
-
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading list of regions for table %s", tableDesc.getTableName()));
- }
-
- Table table = null;
- try {
- table = admin.getConnection().getTable(tableDesc.getTableName());
- } catch (TableNotFoundException e) {
- return new ArrayList<Future<Void>>();
- } finally {
- if (table != null) {
- table.close();
- }
- }
-
- List<RegionTask> tasks = new ArrayList<RegionTask>();
- RegionLocator regionLocator = null;
- try {
- regionLocator = admin.getConnection().getRegionLocator(tableDesc.getTableName());
- for (HRegionLocation location : regionLocator.getAllRegionLocations()) {
- ServerName rs = location.getServerName();
- HRegionInfo region = location.getRegionInfo();
- tasks.add(new RegionTask(admin.getConnection(), region, rs, (RegionStdOutSink) sink, taskType, rawScanEnabled,
- rwLatency));
- Map<String, RegionTaskResult> regionMap = ((RegionStdOutSink) sink).getRegionMap();
- regionMap.put(region.getRegionNameAsString(), new RegionTaskResult(region,
- region.getTable(), rs));
- }
- } finally {
- if (regionLocator != null) {
- regionLocator.close();
- }
- }
- return executor.invokeAll(tasks);
- }
-
- // monitor for zookeeper mode
- private static class ZookeeperMonitor extends Monitor {
- private List<String> hosts;
- private final String znode;
- private final int timeout;
-
- protected ZookeeperMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
- StdOutSink sink, ExecutorService executor, boolean treatFailureAsError, long allowedFailures) {
- super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures);
- Configuration configuration = connection.getConfiguration();
- znode =
- configuration.get(ZOOKEEPER_ZNODE_PARENT,
- DEFAULT_ZOOKEEPER_ZNODE_PARENT);
- timeout = configuration
- .getInt(HConstants.ZK_SESSION_TIMEOUT, HConstants.DEFAULT_ZK_SESSION_TIMEOUT);
- ConnectStringParser parser =
- new ConnectStringParser(ZKConfig.getZKQuorumServersString(configuration));
- hosts = Lists.newArrayList();
- for (InetSocketAddress server : parser.getServerAddresses()) {
- hosts.add(server.toString());
- }
- if (allowedFailures > (hosts.size() - 1) / 2) {
- LOG.warn(String.format("Confirm allowable number of failed ZooKeeper nodes, as quorum will " +
- "already be lost. Setting of %d failures is unexpected for %d ensemble size.",
- allowedFailures, hosts.size()));
- }
- }
-
- @Override public void run() {
- List<ZookeeperTask> tasks = Lists.newArrayList();
- ZookeeperStdOutSink zkSink = null;
- try {
- zkSink = this.getSink();
- } catch (RuntimeException e) {
- LOG.error("Run ZooKeeperMonitor failed!", e);
- this.errorCode = ERROR_EXIT_CODE;
- }
- this.initialized = true;
- for (final String host : hosts) {
- tasks.add(new ZookeeperTask(connection, host, znode, timeout, zkSink));
- }
- try {
- for (Future<Void> future : this.executor.invokeAll(tasks)) {
- try {
- future.get();
- } catch (ExecutionException e) {
- LOG.error("Sniff zookeeper failed!", e);
- this.errorCode = ERROR_EXIT_CODE;
- }
- }
- } catch (InterruptedException e) {
- this.errorCode = ERROR_EXIT_CODE;
- Thread.currentThread().interrupt();
- LOG.error("Sniff zookeeper interrupted!", e);
- }
- this.done = true;
- }
-
- private ZookeeperStdOutSink getSink() {
- if (!(sink instanceof ZookeeperStdOutSink)) {
- throw new RuntimeException("Can only write to zookeeper sink");
- }
- return ((ZookeeperStdOutSink) sink);
- }
- }
+ public int checkZooKeeper() throws Exception;
+ public Map<String, String> getReadFailures();
- // a monitor for regionserver mode
- private static class RegionServerMonitor extends Monitor {
-
- private boolean allRegions;
-
- public RegionServerMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
- StdOutSink sink, ExecutorService executor, boolean allRegions,
- boolean treatFailureAsError, long allowedFailures) {
- super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures);
- this.allRegions = allRegions;
- }
-
- private RegionServerStdOutSink getSink() {
- if (!(sink instanceof RegionServerStdOutSink)) {
- throw new RuntimeException("Can only write to regionserver sink");
- }
- return ((RegionServerStdOutSink) sink);
- }
-
- @Override
- public void run() {
- if (this.initAdmin() && this.checkNoTableNames()) {
- RegionServerStdOutSink regionServerSink = null;
- try {
- regionServerSink = this.getSink();
- } catch (RuntimeException e) {
- LOG.error("Run RegionServerMonitor failed!", e);
- this.errorCode = ERROR_EXIT_CODE;
- }
- Map<String, List<HRegionInfo>> rsAndRMap = this.filterRegionServerByName();
- this.initialized = true;
- this.monitorRegionServers(rsAndRMap, regionServerSink);
- }
- this.done = true;
- }
-
- private boolean checkNoTableNames() {
- List<String> foundTableNames = new ArrayList<String>();
- TableName[] tableNames = null;
-
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading list of tables"));
- }
- try {
- tableNames = this.admin.listTableNames();
- } catch (IOException e) {
- LOG.error("Get listTableNames failed", e);
- this.errorCode = INIT_ERROR_EXIT_CODE;
- return false;
- }
-
- if (this.targets == null || this.targets.length == 0) return true;
-
- for (String target : this.targets) {
- for (TableName tableName : tableNames) {
- if (target.equals(tableName.getNameAsString())) {
- foundTableNames.add(target);
- }
- }
- }
-
- if (foundTableNames.size() > 0) {
- System.err.println("Cannot pass a tablename when using the -regionserver " +
- "option, tablenames:" + foundTableNames.toString());
- this.errorCode = USAGE_EXIT_CODE;
- }
- return foundTableNames.size() == 0;
- }
-
- private void monitorRegionServers(Map<String, List<HRegionInfo>> rsAndRMap,
- RegionServerStdOutSink regionServerSink) {
- List<RegionServerTask> tasks = new ArrayList<>();
- Map<String, AtomicLong> successMap = new HashMap<>();
- Random rand = new Random();
- for (Map.Entry<String, List<HRegionInfo>> entry : rsAndRMap.entrySet()) {
- String serverName = entry.getKey();
- AtomicLong successes = new AtomicLong(0);
- successMap.put(serverName, successes);
- if (entry.getValue().isEmpty()) {
- LOG.error(String.format("Regionserver not serving any regions - %s", serverName));
- } else if (this.allRegions) {
- for (HRegionInfo region : entry.getValue()) {
- tasks.add(new RegionServerTask(this.connection,
- serverName,
- region,
- regionServerSink,
- successes));
- }
- } else {
- // random select a region if flag not set
- HRegionInfo region = entry.getValue().get(rand.nextInt(entry.getValue().size()));
- tasks.add(new RegionServerTask(this.connection,
- serverName,
- region,
- regionServerSink,
- successes));
- }
- }
- try {
- for (Future<Void> future : this.executor.invokeAll(tasks)) {
- try {
- future.get();
- } catch (ExecutionException e) {
- LOG.error("Sniff regionserver failed!", e);
- this.errorCode = ERROR_EXIT_CODE;
- }
- }
- if (this.allRegions) {
- for (Map.Entry<String, List<HRegionInfo>> entry : rsAndRMap.entrySet()) {
- String serverName = entry.getKey();
- LOG.info("Successfully read " + successMap.get(serverName) + " regions out of "
- + entry.getValue().size() + " on regionserver:" + serverName);
- }
- }
- } catch (InterruptedException e) {
- this.errorCode = ERROR_EXIT_CODE;
- LOG.error("Sniff regionserver interrupted!", e);
- }
- }
-
- private Map<String, List<HRegionInfo>> filterRegionServerByName() {
- Map<String, List<HRegionInfo>> regionServerAndRegionsMap = this.getAllRegionServerByName();
- regionServerAndRegionsMap = this.doFilterRegionServerByName(regionServerAndRegionsMap);
- return regionServerAndRegionsMap;
- }
-
- private Map<String, List<HRegionInfo>> getAllRegionServerByName() {
- Map<String, List<HRegionInfo>> rsAndRMap = new HashMap<String, List<HRegionInfo>>();
- Table table = null;
- RegionLocator regionLocator = null;
- try {
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading list of tables and locations"));
- }
- HTableDescriptor[] tableDescs = this.admin.listTables();
- List<HRegionInfo> regions = null;
- for (HTableDescriptor tableDesc : tableDescs) {
- table = this.admin.getConnection().getTable(tableDesc.getTableName());
- regionLocator = this.admin.getConnection().getRegionLocator(tableDesc.getTableName());
-
- for (HRegionLocation location : regionLocator.getAllRegionLocations()) {
- ServerName rs = location.getServerName();
- String rsName = rs.getHostname();
- HRegionInfo r = location.getRegionInfo();
-
- if (rsAndRMap.containsKey(rsName)) {
- regions = rsAndRMap.get(rsName);
- } else {
- regions = new ArrayList<HRegionInfo>();
- rsAndRMap.put(rsName, regions);
- }
- regions.add(r);
- }
- table.close();
- }
-
- //get any live regionservers not serving any regions
- for (ServerName rs : this.admin.getClusterStatus().getServers()) {
- String rsName = rs.getHostname();
- if (!rsAndRMap.containsKey(rsName)) {
- rsAndRMap.put(rsName, Collections.<HRegionInfo>emptyList());
- }
- }
- } catch (IOException e) {
- String msg = "Get HTables info failed";
- LOG.error(msg, e);
- this.errorCode = INIT_ERROR_EXIT_CODE;
- } finally {
- if (table != null) {
- try {
- table.close();
- } catch (IOException e) {
- LOG.warn("Close table failed", e);
- }
- }
- }
-
- return rsAndRMap;
- }
-
- private Map<String, List<HRegionInfo>> doFilterRegionServerByName(
- Map<String, List<HRegionInfo>> fullRsAndRMap) {
-
- Map<String, List<HRegionInfo>> filteredRsAndRMap = null;
-
- if (this.targets != null && this.targets.length > 0) {
- filteredRsAndRMap = new HashMap<String, List<HRegionInfo>>();
- Pattern pattern = null;
- Matcher matcher = null;
- boolean regExpFound = false;
- for (String rsName : this.targets) {
- if (this.useRegExp) {
- regExpFound = false;
- pattern = Pattern.compile(rsName);
- for (Map.Entry<String, List<HRegionInfo>> entry : fullRsAndRMap.entrySet()) {
- matcher = pattern.matcher(entry.getKey());
- if (matcher.matches()) {
- filteredRsAndRMap.put(entry.getKey(), entry.getValue());
- regExpFound = true;
- }
- }
- if (!regExpFound) {
- LOG.info("No RegionServerInfo found, regionServerPattern:" + rsName);
- }
- } else {
- if (fullRsAndRMap.containsKey(rsName)) {
- filteredRsAndRMap.put(rsName, fullRsAndRMap.get(rsName));
- } else {
- LOG.info("No RegionServerInfo found, regionServerName:" + rsName);
- }
- }
- }
- } else {
- filteredRsAndRMap = fullRsAndRMap;
- }
- return filteredRsAndRMap;
- }
- }
-
- public static void main(String[] args) throws Exception {
- final Configuration conf = HBaseConfiguration.create();
-
- // loading the generic options to conf
- new GenericOptionsParser(conf, args);
-
- int numThreads = conf.getInt("hbase.canary.threads.num", MAX_THREADS_NUM);
- LOG.info("Number of execution threads " + numThreads);
-
- ExecutorService executor = new ScheduledThreadPoolExecutor(numThreads);
-
- Class<? extends Sink> sinkClass =
- conf.getClass("hbase.canary.sink.class", RegionServerStdOutSink.class, Sink.class);
- Sink sink = ReflectionUtils.newInstance(sinkClass);
-
- int exitCode = ToolRunner.run(conf, new Canary(executor, sink), args);
- executor.shutdown();
- System.exit(exitCode);
- }
-}
\ No newline at end of file
+ public Map<String, String> getWriteFailures();
+}
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java
similarity index 66%
copy from hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
copy to hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java
index afec6b6..23daf03 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java
@@ -22,14 +22,13 @@ package org.apache.hadoop.hbase.tool;
import static org.apache.hadoop.hbase.HConstants.DEFAULT_ZOOKEEPER_ZNODE_PARENT;
import static org.apache.hadoop.hbase.HConstants.ZOOKEEPER_ZNODE_PARENT;
-import com.google.common.collect.Lists;
-
import java.io.Closeable;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
+import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
@@ -48,20 +47,23 @@ import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Lists;
+
import org.apache.commons.lang.time.StopWatch;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.AuthUtil;
import org.apache.hadoop.hbase.ChoreService;
import org.apache.hadoop.hbase.ClusterStatus;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.HBaseInterfaceAudience;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.NamespaceDescriptor;
import org.apache.hadoop.hbase.ScheduledChore;
import org.apache.hadoop.hbase.ServerName;
@@ -79,63 +81,98 @@ import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
-import org.apache.hadoop.hbase.tool.Canary.RegionTask.TaskType;
+import org.apache.hadoop.hbase.tool.CanaryTool.RegionTask.TaskType;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.ReflectionUtils;
import org.apache.hadoop.hbase.util.RegionSplitter;
import org.apache.hadoop.hbase.zookeeper.EmptyWatcher;
import org.apache.hadoop.hbase.zookeeper.ZKConfig;
-import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.client.ConnectStringParser;
import org.apache.zookeeper.data.Stat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
- * HBase Canary Tool, that that can be used to do
- * "canary monitoring" of a running HBase cluster.
+ * HBase Canary Tool for "canary monitoring" of a running HBase cluster.
*
- * Here are three modes
- * 1. region mode - Foreach region tries to get one row per column family
- * and outputs some information about failure or latency.
+ * There are three modes:
+ * <ol>
+ * <li>region mode (Default): For each region, try to get one row per column family outputting
+ * information on failure (ERROR) or else the latency.
+ * </li>
*
- * 2. regionserver mode - Foreach regionserver tries to get one row from one table
- * selected randomly and outputs some information about failure or latency.
+ * <li>regionserver mode: For each regionserver try to get one row from one table selected
+ * randomly outputting information on failure (ERROR) or else the latency.
+ * </li>
*
- * 3. zookeeper mode - for each zookeeper instance, selects a zNode and
- * outputs some information about failure or latency.
+ * <li>zookeeper mode: for each zookeeper instance, selects a znode outputting information on
+ * failure (ERROR) or else the latency.
+ * </li>
+ * </ol>
*/
-@InterfaceAudience.Private
-public final class Canary implements Tool {
- // Sink interface used by the canary to outputs information
+@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.TOOLS)
+public class CanaryTool implements Tool, Canary {
+
+ @Override
+ public int checkRegions(String[] targets) throws Exception {
+ String configuredReadTableTimeoutsStr = conf.get(HBASE_CANARY_REGION_READ_TABLE_TIMEOUT);
+ try {
+ if (configuredReadTableTimeoutsStr != null) {
+ populateReadTableTimeoutsMap(configuredReadTableTimeoutsStr);
+ }
+ } catch (IllegalArgumentException e) {
+ LOG.error("Constructing read table timeouts map failed ", e);
+ return USAGE_EXIT_CODE;
+ }
+ return runMonitor(targets);
+ }
+
+ @Override
+ public int checkRegionServers(String[] targets) throws Exception {
+ regionServerMode = true;
+ return runMonitor(targets);
+ }
+
+ @Override
+ public int checkZooKeeper() throws Exception {
+ zookeeperMode = true;
+ return runMonitor(null);
+ }
+
+ /**
+ * Sink interface used by the canary to output information
+ */
public interface Sink {
- public long getReadFailureCount();
- public long incReadFailureCount();
- public Map<String,String> getReadFailures();
- public void updateReadFailures(String regionName, String serverName);
- public long getWriteFailureCount();
- public long incWriteFailureCount();
- public Map<String,String> getWriteFailures();
- public void updateWriteFailures(String regionName, String serverName);
- public long getReadSuccessCount();
- public long incReadSuccessCount();
- public long getWriteSuccessCount();
- public long incWriteSuccessCount();
+ long getReadFailureCount();
+ long incReadFailureCount();
+ Map<String,String> getReadFailures();
+ void updateReadFailures(String regionName, String serverName);
+ long getWriteFailureCount();
+ long incWriteFailureCount();
+ Map<String,String> getWriteFailures();
+ void updateWriteFailures(String regionName, String serverName);
+ long getReadSuccessCount();
+ long incReadSuccessCount();
+ long getWriteSuccessCount();
+ long incWriteSuccessCount();
}
- // Simple implementation of canary sink that allows to plot on
- // file or standard output timings or failures.
+ /**
+ * Simple implementation of canary sink that allows plotting to a file or standard output.
+ */
public static class StdOutSink implements Sink {
private AtomicLong readFailureCount = new AtomicLong(0),
writeFailureCount = new AtomicLong(0),
readSuccessCount = new AtomicLong(0),
writeSuccessCount = new AtomicLong(0);
-
- private Map<String, String> readFailures = new ConcurrentHashMap<String, String>();
- private Map<String, String> writeFailures = new ConcurrentHashMap<String, String>();
+ private Map<String, String> readFailures = new ConcurrentHashMap<>();
+ private Map<String, String> writeFailures = new ConcurrentHashMap<>();
@Override
public long getReadFailureCount() {
@@ -198,76 +235,84 @@ public final class Canary implements Tool {
}
}
+ /**
+ * By RegionServer, for 'regionserver' mode.
+ */
public static class RegionServerStdOutSink extends StdOutSink {
-
public void publishReadFailure(String table, String server) {
incReadFailureCount();
- LOG.error(String.format("Read from table:%s on region server:%s", table, server));
+ LOG.error("Read from {} on {}", table, server);
}
public void publishReadTiming(String table, String server, long msTime) {
- LOG.info(String.format("Read from table:%s on region server:%s in %dms",
- table, server, msTime));
+ LOG.info("Read from {} on {} in {}ms", table, server, msTime);
}
}
+ /**
+ * Output for 'zookeeper' mode.
+ */
public static class ZookeeperStdOutSink extends StdOutSink {
-
- public void publishReadFailure(String zNode, String server) {
+ public void publishReadFailure(String znode, String server) {
incReadFailureCount();
- LOG.error(String.format("Read from zNode:%s on zookeeper instance:%s", zNode, server));
+ LOG.error("Read from {} on {}", znode, server);
}
public void publishReadTiming(String znode, String server, long msTime) {
- LOG.info(String.format("Read from zNode:%s on zookeeper instance:%s in %dms",
- znode, server, msTime));
+ LOG.info("Read from {} on {} in {}ms", znode, server, msTime);
}
}
+ /**
+ * By Region, for 'region' mode.
+ */
public static class RegionStdOutSink extends StdOutSink {
-
private Map<String, AtomicLong> perTableReadLatency = new HashMap<>();
private AtomicLong writeLatency = new AtomicLong();
- private Map<String, RegionTaskResult> regionMap = new ConcurrentHashMap<>();
+ private final Map<String, RegionTaskResult> regionMap = new ConcurrentHashMap<>();
public void publishReadFailure(ServerName serverName, HRegionInfo region, Exception e) {
incReadFailureCount();
- LOG.error(String.format("read from region %s on regionserver %s failed", region.getRegionNameAsString(), serverName), e);
+ LOG.error("Read from {} on {} failed", region.getRegionNameAsString(), serverName, e);
}
- public void publishReadFailure(ServerName serverName, HRegionInfo region, HColumnDescriptor column, Exception e) {
+ public void publishReadFailure(ServerName serverName, HRegionInfo region,
+ HColumnDescriptor column, Exception e) {
incReadFailureCount();
- LOG.error(String.format("read from region %s on regionserver %s column family %s failed",
- region.getRegionNameAsString(), serverName, column.getNameAsString()), e);
+ LOG.error("Read from {} on {} {} failed", region.getRegionNameAsString(), serverName,
+ column.getNameAsString(), e);
}
- public void publishReadTiming(ServerName serverName, HRegionInfo region, HColumnDescriptor column, long msTime) {
+ public void publishReadTiming(ServerName serverName, HRegionInfo region,
+ HColumnDescriptor column, long msTime) {
incReadSuccessCount();
RegionTaskResult res = this.regionMap.get(region.getRegionNameAsString());
res.setReadSuccess();
res.setReadLatency(msTime);
- LOG.info(String.format("read from region %s on regionserver %s column family %s in %dms",
- region.getRegionNameAsString(), serverName, column.getNameAsString(), msTime));
+ LOG.info("Read from {} on {} {} in {}ms", region.getRegionNameAsString(), serverName,
+ column.getNameAsString(), msTime);
}
public void publishWriteFailure(ServerName serverName, HRegionInfo region, Exception e) {
incWriteFailureCount();
- LOG.error(String.format("write to region %s on regionserver %s failed", region.getRegionNameAsString(), serverName), e);
+ LOG.error("Write to {} on {} failed", region.getRegionNameAsString(), serverName, e);
}
- public void publishWriteFailure(ServerName serverName, HRegionInfo region, HColumnDescriptor column, Exception e) {
+ public void publishWriteFailure(ServerName serverName, HRegionInfo region,
+ HColumnDescriptor column, Exception e) {
incWriteFailureCount();
- LOG.error(String.format("write to region %s on regionserver %s column family %s failed",
- region.getRegionNameAsString(), serverName, column.getNameAsString()), e);
+ LOG.error("Write to {} on {} {} failed", region.getRegionNameAsString(), serverName,
+ column.getNameAsString(), e);
}
- public void publishWriteTiming(ServerName serverName, HRegionInfo region, HColumnDescriptor column, long msTime) {
+ public void publishWriteTiming(ServerName serverName, HRegionInfo region,
+ HColumnDescriptor column, long msTime) {
incWriteSuccessCount();
RegionTaskResult res = this.regionMap.get(region.getRegionNameAsString());
res.setWriteSuccess();
res.setWriteLatency(msTime);
- LOG.info(String.format("write to region %s on regionserver %s column family %s in %dms",
- region.getRegionNameAsString(), serverName, column.getNameAsString(), msTime));
+ LOG.info("Write to {} on {} {} in {}ms",
+ region.getRegionNameAsString(), serverName, column.getNameAsString(), msTime);
}
public Map<String, AtomicLong> getReadLatencyMap() {
@@ -275,13 +320,13 @@ public final class Canary implements Tool {
}
public AtomicLong initializeAndGetReadLatencyForTable(String tableName) {
- AtomicLong initLatency = new AtomicLong(0L);
+ AtomicLong initLatency = new AtomicLong();
this.perTableReadLatency.put(tableName, initLatency);
return initLatency;
}
public void initializeWriteLatency() {
- this.writeLatency.set(0L);
+ this.writeLatency.set(0);
}
public AtomicLong getWriteLatency() {
@@ -297,6 +342,9 @@ public final class Canary implements Tool {
}
}
+ /**
+ * Run a single zookeeper Task and then exit.
+ */
static class ZookeeperTask implements Callable<Void> {
private final Connection connection;
private final String host;
@@ -335,8 +383,8 @@ public final class Canary implements Tool {
}
/**
- * For each column family of the region tries to get one row and outputs the latency, or the
- * failure.
+ * Run a single Region Task and then exit. For each column family of the Region, get one row and
+ * output latency or failure.
*/
static class RegionTask implements Callable<Void> {
public enum TaskType{
@@ -350,8 +398,8 @@ public final class Canary implements Tool {
private ServerName serverName;
private AtomicLong readWriteLatency;
- RegionTask(Connection connection, HRegionInfo region, ServerName serverName, RegionStdOutSink sink,
- TaskType taskType, boolean rawScanEnabled, AtomicLong rwLatency) {
+ RegionTask(Connection connection, HRegionInfo region, ServerName serverName,
+ RegionStdOutSink sink, TaskType taskType, boolean rawScanEnabled, AtomicLong rwLatency) {
this.connection = connection;
this.region = region;
this.serverName = serverName;
@@ -377,14 +425,11 @@ public final class Canary implements Tool {
Table table = null;
HTableDescriptor tableDesc = null;
try {
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading table descriptor for table %s",
- region.getTable()));
- }
+ LOG.debug("Reading table descriptor for table {}", region.getTable());
table = connection.getTable(region.getTable());
tableDesc = table.getTableDescriptor();
} catch (IOException e) {
- LOG.debug("sniffRegion failed", e);
+ LOG.debug("sniffRegion {} of {} failed", region.getEncodedName(), e);
sink.publishReadFailure(serverName, region, e);
if (table != null) {
try {
@@ -412,10 +457,7 @@ public final class Canary implements Tool {
get.addFamily(column.getName());
} else {
scan = new Scan();
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("rawScan : %s for table: %s", rawScanEnabled,
- tableDesc.getTableName()));
- }
+ LOG.debug("rawScan {} for {}", rawScanEnabled, tableDesc.getTableName());
scan.setRaw(rawScanEnabled);
scan.setCaching(1);
scan.setCacheBlocks(false);
@@ -424,12 +466,9 @@ public final class Canary implements Tool {
scan.setMaxResultSize(1L);
scan.setOneRowLimit();
}
-
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading from table %s region %s column family %s and key %s",
- tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(),
- Bytes.toStringBinary(startKey)));
- }
+ LOG.debug("Reading from {} {} {} {}", tableDesc.getTableName(),
+ region.getRegionNameAsString(), column.getNameAsString(),
+ Bytes.toStringBinary(startKey));
try {
stopWatch.start();
if (startKey.length > 0) {
@@ -462,7 +501,6 @@ public final class Canary implements Tool {
/**
* Check writes for the canary table
- * @return
*/
private Void write() {
Table table = null;
@@ -482,11 +520,9 @@ public final class Canary implements Tool {
Bytes.random(value);
put.addColumn(column.getName(), HConstants.EMPTY_BYTE_ARRAY, value);
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("writing to table %s region %s column family %s and key %s",
- tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(),
- Bytes.toStringBinary(rowToCheck)));
- }
+ LOG.debug("Writing to {} {} {} {}",
+ tableDesc.getTableName(), region.getRegionNameAsString(), column.getNameAsString(),
+ Bytes.toStringBinary(rowToCheck));
try {
long startTime = System.currentTimeMillis();
table.put(put);
@@ -507,7 +543,8 @@ public final class Canary implements Tool {
}
/**
- * Get one row from a region on the regionserver and outputs the latency, or the failure.
+ * Run a single RegionServer Task and then exit.
+ * Get one row from a region on the regionserver and output latency or the failure.
*/
static class RegionServerTask implements Callable<Void> {
private Connection connection;
@@ -540,11 +577,9 @@ public final class Canary implements Tool {
table = connection.getTable(tableName);
startKey = region.getStartKey();
// Can't do a get on empty start row so do a Scan of first element if any instead.
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading from region server %s table %s region %s and key %s",
- serverName, region.getTable(), region.getRegionNameAsString(),
- Bytes.toStringBinary(startKey)));
- }
+ LOG.debug("Reading from {} {} {} {}",
+ serverName, region.getTable(), region.getRegionNameAsString(),
+ Bytes.toStringBinary(startKey));
if (startKey.length > 0) {
get = new Get(startKey);
get.setCacheBlocks(false);
@@ -576,10 +611,10 @@ public final class Canary implements Tool {
LOG.debug("The targeted table was disabled. Assuming success.");
} catch (DoNotRetryIOException dnrioe) {
sink.publishReadFailure(tableName.getNameAsString(), serverName);
- LOG.error(dnrioe);
+ LOG.error(dnrioe.toString(), dnrioe);
} catch (IOException e) {
sink.publishReadFailure(tableName.getNameAsString(), serverName);
- LOG.error(e);
+ LOG.error(e.toString(), e);
} finally {
if (table != null) {
try {
@@ -607,7 +642,7 @@ public final class Canary implements Tool {
private static final long DEFAULT_TIMEOUT = 600000; // 10 mins
private static final int MAX_THREADS_NUM = 16; // #threads to contact regions
- private static final Log LOG = LogFactory.getLog(Canary.class);
+ private static final Logger LOG = LoggerFactory.getLogger(Canary.class);
public static final TableName DEFAULT_WRITE_TABLE_NAME = TableName.valueOf(
NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR, "canary");
@@ -618,30 +653,68 @@ public final class Canary implements Tool {
private long interval = 0;
private Sink sink = null;
- private boolean useRegExp;
- private long timeout = DEFAULT_TIMEOUT;
- private boolean failOnError = true;
+ /**
+ * True if we are to run in 'regionServer' mode.
+ */
private boolean regionServerMode = false;
+
+ /**
+ * True if we are to run in zookeeper 'mode'.
+ */
private boolean zookeeperMode = false;
- private long permittedFailures = 0;
- private boolean regionServerAllRegions = false;
- private boolean writeSniffing = false;
- private long configuredWriteTableTimeout = DEFAULT_TIMEOUT;
- private boolean treatFailureAsError = false;
- private TableName writeTableName = DEFAULT_WRITE_TABLE_NAME;
+
+ /**
+ * This is a Map of table to timeout. The timeout is for reading all regions in the table; i.e.
+ * we aggregate time to fetch each region and it needs to be less than this value else we
+ * log an ERROR.
+ */
private HashMap<String, Long> configuredReadTableTimeouts = new HashMap<>();
+ public static final String HBASE_CANARY_REGIONSERVER_ALL_REGIONS
+ = "hbase.canary.regionserver_all_regions";
+
+ public static final String HBASE_CANARY_REGION_WRITE_SNIFFING
+ = "hbase.canary.region.write.sniffing";
+ public static final String HBASE_CANARY_REGION_WRITE_TABLE_TIMEOUT
+ = "hbase.canary.region.write.table.timeout";
+ public static final String HBASE_CANARY_REGION_WRITE_TABLE_NAME
+ = "hbase.canary.region.write.table.name";
+ public static final String HBASE_CANARY_REGION_READ_TABLE_TIMEOUT
+ = "hbase.canary.region.read.table.timeout";
+
+ public static final String HBASE_CANARY_ZOOKEEPER_PERMITTED_FAILURES
+ = "hbase.canary.zookeeper.permitted.failures";
+
+ public static final String HBASE_CANARY_USE_REGEX = "hbase.canary.use.regex";
+ public static final String HBASE_CANARY_TIMEOUT = "hbase.canary.timeout";
+ public static final String HBASE_CANARY_FAIL_ON_ERROR = "hbase.canary.fail.on.error";
+
+
private ExecutorService executor; // threads to retrieve data from regionservers
- public Canary() {
- this(new ScheduledThreadPoolExecutor(1), new RegionServerStdOutSink());
+ public CanaryTool() {
+ this(new ScheduledThreadPoolExecutor(1));
}
- public Canary(ExecutorService executor, Sink sink) {
+ public CanaryTool(ExecutorService executor) {
+ this(executor, null);
+ }
+
+ @VisibleForTesting
+ CanaryTool(ExecutorService executor, Sink sink) {
this.executor = executor;
this.sink = sink;
}
+ CanaryTool(Configuration conf, ExecutorService executor) {
+ this(conf, executor, null);
+ }
+
+ CanaryTool(Configuration conf, ExecutorService executor, Sink sink) {
+ this(executor, sink);
+ setConf(conf);
+ }
+
@Override
public Configuration getConf() {
return conf;
@@ -649,11 +722,18 @@ public final class Canary implements Tool {
@Override
public void setConf(Configuration conf) {
+ if (conf == null) {
+ conf = HBaseConfiguration.create();
+ }
this.conf = conf;
}
private int parseArgs(String[] args) {
int index = -1;
+ long permittedFailures = 0;
+ boolean regionServerAllRegions = false, writeSniffing = false;
+ String readTableTimeoutsStr = null;
+
// Process command line args
for (int i = 0; i < args.length; i++) {
String cmd = args[i];
@@ -665,7 +745,7 @@ public final class Canary implements Tool {
printUsageAndExit();
}
- if (cmd.equals("-help")) {
+ if (cmd.equals("-help") || cmd.equals("-h")) {
// user asked for help, print the help and quit.
printUsageAndExit();
} else if (cmd.equals("-daemon") && interval == 0) {
@@ -676,7 +756,7 @@ public final class Canary implements Tool {
i++;
if (i == args.length) {
- System.err.println("-interval needs a numeric value argument.");
+ System.err.println("-interval takes a numeric seconds value argument.");
printUsageAndExit();
}
@@ -691,49 +771,53 @@ public final class Canary implements Tool {
} else if(cmd.equals("-regionserver")) {
this.regionServerMode = true;
} else if(cmd.equals("-allRegions")) {
- this.regionServerAllRegions = true;
+ conf.setBoolean(HBASE_CANARY_REGIONSERVER_ALL_REGIONS, true);
+ regionServerAllRegions = true;
} else if(cmd.equals("-writeSniffing")) {
- this.writeSniffing = true;
- } else if(cmd.equals("-treatFailureAsError")) {
- this.treatFailureAsError = true;
+ writeSniffing = true;
+ conf.setBoolean(HBASE_CANARY_REGION_WRITE_SNIFFING, true);
+ } else if(cmd.equals("-treatFailureAsError") || cmd.equals("-failureAsError")) {
+ conf.setBoolean(HBASE_CANARY_FAIL_ON_ERROR, true);
} else if (cmd.equals("-e")) {
- this.useRegExp = true;
+ conf.setBoolean(HBASE_CANARY_USE_REGEX, true);
} else if (cmd.equals("-t")) {
i++;
if (i == args.length) {
- System.err.println("-t needs a numeric value argument.");
+ System.err.println("-t takes a numeric milliseconds value argument.");
printUsageAndExit();
}
-
+ long timeout = 0;
try {
- this.timeout = Long.parseLong(args[i]);
+ timeout = Long.parseLong(args[i]);
} catch (NumberFormatException e) {
- System.err.println("-t needs a numeric value argument.");
+ System.err.println("-t takes a numeric milliseconds value argument.");
printUsageAndExit();
}
+ conf.setLong(HBASE_CANARY_TIMEOUT, timeout);
} else if(cmd.equals("-writeTableTimeout")) {
i++;
if (i == args.length) {
- System.err.println("-writeTableTimeout needs a numeric value argument.");
+ System.err.println("-writeTableTimeout takes a numeric milliseconds value argument.");
printUsageAndExit();
}
-
+ long configuredWriteTableTimeout = 0;
try {
- this.configuredWriteTableTimeout = Long.parseLong(args[i]);
+ configuredWriteTableTimeout = Long.parseLong(args[i]);
} catch (NumberFormatException e) {
- System.err.println("-writeTableTimeout needs a numeric value argument.");
+ System.err.println("-writeTableTimeout takes a numeric milliseconds value argument.");
printUsageAndExit();
}
+ conf.setLong(HBASE_CANARY_REGION_WRITE_TABLE_TIMEOUT, configuredWriteTableTimeout);
} else if (cmd.equals("-writeTable")) {
i++;
if (i == args.length) {
- System.err.println("-writeTable needs a string value argument.");
+ System.err.println("-writeTable takes a string tablename value argument.");
printUsageAndExit();
}
- this.writeTableName = TableName.valueOf(args[i]);
+ conf.set(HBASE_CANARY_REGION_WRITE_TABLE_NAME, args[i]);
} else if (cmd.equals("-f")) {
i++;
@@ -743,30 +827,17 @@ public final class Canary implements Tool {
printUsageAndExit();
}
- this.failOnError = Boolean.parseBoolean(args[i]);
+ conf.setBoolean(HBASE_CANARY_FAIL_ON_ERROR, Boolean.parseBoolean(args[i]));
} else if (cmd.equals("-readTableTimeouts")) {
i++;
if (i == args.length) {
- System.err.println("-readTableTimeouts needs a comma-separated list of read timeouts per table (without spaces).");
+ System.err.println("-readTableTimeouts needs a comma-separated list of read " +
+ "millisecond timeouts per table (without spaces).");
printUsageAndExit();
}
- String [] tableTimeouts = args[i].split(",");
- for (String tT: tableTimeouts) {
- String [] nameTimeout = tT.split("=");
- if (nameTimeout.length < 2) {
- System.err.println("Each -readTableTimeouts argument must be of the form <tableName>=<read timeout>.");
- printUsageAndExit();
- }
- long timeoutVal = 0L;
- try {
- timeoutVal = Long.parseLong(nameTimeout[1]);
- } catch (NumberFormatException e) {
- System.err.println("-readTableTimeouts read timeout for each table must be a numeric value argument.");
- printUsageAndExit();
- }
- this.configuredReadTableTimeouts.put(nameTimeout[0], timeoutVal);
- }
+ readTableTimeoutsStr = args[i];
+ conf.set(HBASE_CANARY_REGION_READ_TABLE_TIMEOUT, readTableTimeoutsStr);
} else if (cmd.equals("-permittedZookeeperFailures")) {
i++;
@@ -775,11 +846,12 @@ public final class Canary implements Tool {
printUsageAndExit();
}
try {
- this.permittedFailures = Long.parseLong(args[i]);
+ permittedFailures = Long.parseLong(args[i]);
} catch (NumberFormatException e) {
System.err.println("-permittedZookeeperFailures needs a numeric value argument.");
printUsageAndExit();
}
+ conf.setLong(HBASE_CANARY_ZOOKEEPER_PERMITTED_FAILURES, permittedFailures);
} else {
// no options match
System.err.println(cmd + " options is invalid.");
@@ -790,22 +862,22 @@ public final class Canary implements Tool {
index = i;
}
}
- if (this.regionServerAllRegions && !this.regionServerMode) {
+ if (regionServerAllRegions && !this.regionServerMode) {
System.err.println("-allRegions can only be specified in regionserver mode.");
printUsageAndExit();
}
if (this.zookeeperMode) {
- if (this.regionServerMode || this.regionServerAllRegions || this.writeSniffing) {
+ if (this.regionServerMode || regionServerAllRegions || writeSniffing) {
System.err.println("-zookeeper is exclusive and cannot be combined with "
+ "other modes.");
printUsageAndExit();
}
}
- if (this.permittedFailures != 0 && !this.zookeeperMode) {
+ if (permittedFailures != 0 && !this.zookeeperMode) {
System.err.println("-permittedZookeeperFailures requires -zookeeper mode.");
printUsageAndExit();
}
- if (!this.configuredReadTableTimeouts.isEmpty() && (this.regionServerMode || this.zookeeperMode)) {
+ if (readTableTimeoutsStr != null && (this.regionServerMode || this.zookeeperMode)) {
System.err.println("-readTableTimeouts can only be configured in region mode.");
printUsageAndExit();
}
@@ -815,6 +887,24 @@ public final class Canary implements Tool {
@Override
public int run(String[] args) throws Exception {
int index = parseArgs(args);
+ String[] monitorTargets = null;
+
+ if (index >= 0) {
+ int length = args.length - index;
+ monitorTargets = new String[length];
+ System.arraycopy(args, index, monitorTargets, 0, length);
+ }
+
+ if (zookeeperMode) {
+ return checkZooKeeper();
+ } else if (regionServerMode) {
+ return checkRegionServers(monitorTargets);
+ } else {
+ return checkRegions(monitorTargets);
+ }
+ }
+
+ private int runMonitor(String[] monitorTargets) throws Exception {
ChoreService choreService = null;
// Launches chore for refreshing kerberos credentials if security is enabled.
@@ -828,15 +918,17 @@ public final class Canary implements Tool {
// Start to prepare the stuffs
Monitor monitor = null;
- Thread monitorThread = null;
+ Thread monitorThread;
long startTime = 0;
long currentTimeLength = 0;
+ boolean failOnError = conf.getBoolean(HBASE_CANARY_FAIL_ON_ERROR, true);
+ long timeout = conf.getLong(HBASE_CANARY_TIMEOUT, DEFAULT_TIMEOUT);
// Get a connection to use in below.
try (Connection connection = ConnectionFactory.createConnection(this.conf)) {
do {
// Do monitor !!
try {
- monitor = this.newMonitor(connection, index, args);
+ monitor = this.newMonitor(connection, monitorTargets);
monitorThread = new Thread(monitor, "CanaryMonitor-" + System.currentTimeMillis());
startTime = System.currentTimeMillis();
monitorThread.start();
@@ -844,7 +936,7 @@ public final class Canary implements Tool {
// wait for 1 sec
Thread.sleep(1000);
// exit if any error occurs
- if (this.failOnError && monitor.hasError()) {
+ if (failOnError && monitor.hasError()) {
monitorThread.interrupt();
if (monitor.initialized) {
return monitor.errorCode;
@@ -853,9 +945,9 @@ public final class Canary implements Tool {
}
}
currentTimeLength = System.currentTimeMillis() - startTime;
- if (currentTimeLength > this.timeout) {
+ if (currentTimeLength > timeout) {
LOG.error("The monitor is running too long (" + currentTimeLength
- + ") after timeout limit:" + this.timeout
+ + ") after timeout limit:" + timeout
+ " will be killed itself !!");
if (monitor.initialized) {
return TIMEOUT_ERROR_EXIT_CODE;
@@ -865,7 +957,7 @@ public final class Canary implements Tool {
}
}
- if (this.failOnError && monitor.finalCheckForErrors()) {
+ if (failOnError && monitor.finalCheckForErrors()) {
monitorThread.interrupt();
return monitor.errorCode;
}
@@ -883,49 +975,65 @@ public final class Canary implements Tool {
return monitor.errorCode;
}
+ @Override
public Map<String, String> getReadFailures() {
return sink.getReadFailures();
}
+ @Override
public Map<String, String> getWriteFailures() {
return sink.getWriteFailures();
}
private void printUsageAndExit() {
- System.err.printf(
- "Usage: bin/hbase %s [opts] [table1 [table2]...] | [regionserver1 [regionserver2]..]%n",
- getClass().getName());
- System.err.println(" where [opts] are:");
- System.err.println(" -help Show this help and exit.");
- System.err.println(" -regionserver replace the table argument to regionserver,");
- System.err.println(" which means to enable regionserver mode");
- System.err.println(" -allRegions Tries all regions on a regionserver,");
- System.err.println(" only works in regionserver mode.");
- System.err.println(" -zookeeper Tries to grab zookeeper.znode.parent ");
- System.err.println(" on each zookeeper instance");
- System.err.println(" -permittedZookeeperFailures <N> Ignore first N failures when attempting to ");
- System.err.println(" connect to individual zookeeper nodes in the ensemble");
- System.err.println(" -daemon Continuous check at defined intervals.");
- System.err.println(" -interval <N> Interval between checks (sec)");
- System.err.println(" -e Use table/regionserver as regular expression");
- System.err.println(" which means the table/regionserver is regular expression pattern");
- System.err.println(" -f <B> stop whole program if first error occurs," +
- " default is true");
- System.err.println(" -t <N> timeout for a check, default is 600000 (millisecs)");
- System.err.println(" -writeTableTimeout <N> write timeout for the writeTable, default is 600000 (millisecs)");
- System.err.println(" -readTableTimeouts <tableName>=<read timeout>,<tableName>=<read timeout>, ... "
- + "comma-separated list of read timeouts per table (no spaces), default is 600000 (millisecs)");
- System.err.println(" -writeSniffing enable the write sniffing in canary");
- System.err.println(" -treatFailureAsError treats read / write failure as error");
- System.err.println(" -writeTable The table used for write sniffing."
- + " Default is hbase:canary");
- System.err.println(" -Dhbase.canary.read.raw.enabled=<true/false> Use this flag to enable or disable raw scan during read canary test"
- + " Default is false and raw is not enabled during scan");
- System.err
- .println(" -D<configProperty>=<value> assigning or override the configuration params");
+ System.err.println(
+ "Usage: canary [OPTIONS] [<TABLE1> [<TABLE2]...] | [<REGIONSERVER1> [<REGIONSERVER2]..]");
+ System.err.println("Where [OPTIONS] are:");
+ System.err.println(" -h,-help show this help and exit.");
+ System.err.println(" -regionserver set 'regionserver mode'; gets row from random region on " +
+ "server");
+ System.err.println(" -allRegions get from ALL regions when 'regionserver mode', not just " +
+ "random one.");
+ System.err.println(" -zookeeper set 'zookeeper mode'; grab zookeeper.znode.parent on " +
+ "each ensemble member");
+ System.err.println(" -daemon continuous check at defined intervals.");
+ System.err.println(" -interval <N> interval between checks in seconds");
+ System.err.println(" -e consider table/regionserver argument as regular " +
+ "expression");
+ System.err.println(" -f <B> exit on first error; default=true");
+ System.err.println(" -failureAsError treat read/write failure as error");
+ System.err.println(" -t <N> timeout for canary-test run; default=600000ms");
+ System.err.println(" -writeSniffing enable write sniffing");
+ System.err.println(" -writeTable the table used for write sniffing; default=hbase:canary");
+ System.err.println(" -writeTableTimeout <N> timeout for writeTable; default=600000ms");
+ System.err.println(" -readTableTimeouts <tableName>=<read timeout>," +
+ "<tableName>=<read timeout>,...");
+ System.err.println(" comma-separated list of table read timeouts " +
+ "(no spaces);");
+ System.err.println(" logs 'ERROR' if takes longer. default=600000ms");
+ System.err.println(" -permittedZookeeperFailures <N> Ignore first N failures attempting to ");
+ System.err.println(" connect to individual zookeeper nodes in ensemble");
+ System.err.println("");
+ System.err.println(" -D<configProperty>=<value> to assign or override configuration params");
+ System.err.println(" -Dhbase.canary.read.raw.enabled=<true/false> Set to enable/disable " +
+ "raw scan; default=false");
+ System.err.println("");
+ System.err.println("Canary runs in one of three modes: region (default), regionserver, or " +
+ "zookeeper.");
+ System.err.println("To sniff/probe all regions, pass no arguments.");
+ System.err.println("To sniff/probe all regions of a table, pass tablename.");
+ System.err.println("To sniff/probe regionservers, pass -regionserver, etc.");
+ System.err.println("See http://hbase.apache.org/book.html#_canary for Canary documentation.");
System.exit(USAGE_EXIT_CODE);
}
+ Sink getSink(Configuration configuration, Class clazz) {
+ // In test context, this.sink might be set. Use it if non-null. For testing.
+ return this.sink != null? this.sink:
+ (Sink)ReflectionUtils.newInstance(configuration.getClass("hbase.canary.sink.class",
+ clazz, Sink.class));
+ }
+
/**
* Canary region mode-specific data structure which stores information about each region
* to be scanned
@@ -1018,46 +1126,76 @@ public final class Canary implements Tool {
/**
* A Factory method for {@link Monitor}.
- * Can be overridden by user.
- * @param index a start index for monitor target
- * @param args args passed from user
+ * Makes a RegionServerMonitor, or a ZooKeeperMonitor, or a RegionMonitor.
* @return a Monitor instance
*/
- public Monitor newMonitor(final Connection connection, int index, String[] args) {
- Monitor monitor = null;
- String[] monitorTargets = null;
-
- if(index >= 0) {
- int length = args.length - index;
- monitorTargets = new String[length];
- System.arraycopy(args, index, monitorTargets, 0, length);
- }
-
- if (this.sink instanceof RegionServerStdOutSink || this.regionServerMode) {
+ private Monitor newMonitor(final Connection connection, String[] monitorTargets) {
+ Monitor monitor;
+ boolean useRegExp = conf.getBoolean(HBASE_CANARY_USE_REGEX, false);
+ boolean regionServerAllRegions
+ = conf.getBoolean(HBASE_CANARY_REGIONSERVER_ALL_REGIONS, false);
+ boolean failOnError
+ = conf.getBoolean(HBASE_CANARY_FAIL_ON_ERROR, true);
+ int permittedFailures
+ = conf.getInt(HBASE_CANARY_ZOOKEEPER_PERMITTED_FAILURES, 0);
+ boolean writeSniffing
+ = conf.getBoolean(HBASE_CANARY_REGION_WRITE_SNIFFING, false);
+ String writeTableName = conf.get(HBASE_CANARY_REGION_WRITE_TABLE_NAME,
+ DEFAULT_WRITE_TABLE_NAME.getNameAsString());
+ long configuredWriteTableTimeout
+ = conf.getLong(HBASE_CANARY_REGION_WRITE_TABLE_TIMEOUT, DEFAULT_TIMEOUT);
+
+ if (this.regionServerMode) {
monitor =
- new RegionServerMonitor(connection, monitorTargets, this.useRegExp,
- (StdOutSink) this.sink, this.executor, this.regionServerAllRegions,
- this.treatFailureAsError, this.permittedFailures);
- } else if (this.sink instanceof ZookeeperStdOutSink || this.zookeeperMode) {
+ new RegionServerMonitor(connection, monitorTargets, useRegExp,
+ getSink(connection.getConfiguration(), RegionServerStdOutSink.class),
+ this.executor, regionServerAllRegions,
+ failOnError, permittedFailures);
+
+ } else if (this.zookeeperMode) {
monitor =
- new ZookeeperMonitor(connection, monitorTargets, this.useRegExp,
- (StdOutSink) this.sink, this.executor, this.treatFailureAsError,
- this.permittedFailures);
+ new ZookeeperMonitor(connection, monitorTargets, useRegExp,
+ getSink(connection.getConfiguration(), ZookeeperStdOutSink.class),
+ this.executor, failOnError, permittedFailures);
} else {
monitor =
- new RegionMonitor(connection, monitorTargets, this.useRegExp,
- (StdOutSink) this.sink, this.executor, this.writeSniffing,
- this.writeTableName, this.treatFailureAsError, this.configuredReadTableTimeouts,
- this.configuredWriteTableTimeout, this.permittedFailures);
+ new RegionMonitor(connection, monitorTargets, useRegExp,
+ getSink(connection.getConfiguration(), RegionStdOutSink.class),
+ this.executor, writeSniffing,
+ TableName.valueOf(writeTableName), failOnError, configuredReadTableTimeouts,
+ configuredWriteTableTimeout, permittedFailures);
}
return monitor;
}
- // a Monitor super-class can be extended by users
+ private void populateReadTableTimeoutsMap(String configuredReadTableTimeoutsStr) {
+ String[] tableTimeouts = configuredReadTableTimeoutsStr.split(",");
+ for (String tT : tableTimeouts) {
+ String[] nameTimeout = tT.split("=");
+ if (nameTimeout.length < 2) {
+ throw new IllegalArgumentException("Each -readTableTimeouts argument must be of the form " +
+ "<tableName>=<read timeout> (without spaces).");
+ }
+ long timeoutVal;
+ try {
+ timeoutVal = Long.parseLong(nameTimeout[1]);
+ } catch (NumberFormatException e) {
+ throw new IllegalArgumentException("-readTableTimeouts read timeout for each table" +
+ " must be a numeric value argument.");
+ }
+ configuredReadTableTimeouts.put(nameTimeout[0], timeoutVal);
+ }
+ }
+ /**
+ * A Monitor super-class can be extended by users
+ */
public static abstract class Monitor implements Runnable, Closeable {
-
protected Connection connection;
protected Admin admin;
+ /**
+ * 'Target' dependent on 'mode'. Could be Tables or RegionServers or ZNodes.
+ * Passed on the command-line as arguments.
+ */
protected String[] targets;
protected boolean useRegExp;
protected boolean treatFailureAsError;
@@ -1127,7 +1265,9 @@ public final class Canary implements Tool {
}
}
- // a monitor for region mode
+ /**
+ * A monitor for region mode.
+ */
private static class RegionMonitor extends Monitor {
// 10 minutes
private static final int DEFAULT_WRITE_TABLE_CHECK_PERIOD = 10 * 60 * 1000;
@@ -1142,14 +1282,22 @@ public final class Canary implements Tool {
private float regionsUpperLimit;
private int checkPeriod;
private boolean rawScanEnabled;
+
+ /**
+ * This is a timeout per table. If read of each region in the table aggregated takes longer
+ * than what is configured here, we log an ERROR rather than just an INFO.
+ */
private HashMap<String, Long> configuredReadTableTimeouts;
+
private long configuredWriteTableTimeout;
public RegionMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
- StdOutSink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName,
+ Sink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName,
boolean treatFailureAsError, HashMap<String, Long> configuredReadTableTimeouts,
- long configuredWriteTableTimeout, long allowedFailures) {
- super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures);
+ long configuredWriteTableTimeout,
+ long allowedFailures) {
+ super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError,
+ allowedFailures);
Configuration conf = connection.getConfiguration();
this.writeSniffing = writeSniffing;
this.writeTableName = writeTableName;
@@ -1182,16 +1330,19 @@ public final class Canary implements Tool {
RegionStdOutSink regionSink = this.getSink();
if (this.targets != null && this.targets.length > 0) {
String[] tables = generateMonitorTables(this.targets);
- // Check to see that each table name passed in the -readTableTimeouts argument is also passed as a monitor target.
- if (! new HashSet<>(Arrays.asList(tables)).containsAll(this.configuredReadTableTimeouts.keySet())) {
- LOG.error("-readTableTimeouts can only specify read timeouts for monitor targets passed via command line.");
+ // Check to see that each table name passed in the -readTableTimeouts argument is also
+ // passed as a monitor target.
+ if (!new HashSet<>(Arrays.asList(tables)).
+ containsAll(this.configuredReadTableTimeouts.keySet())) {
+ LOG.error("-readTableTimeouts can only specify read timeouts for monitor targets " +
+ "passed via command line.");
this.errorCode = USAGE_EXIT_CODE;
return;
}
this.initialized = true;
for (String table : tables) {
AtomicLong readLatency = regionSink.initializeAndGetReadLatencyForTable(table);
- taskFutures.addAll(Canary.sniff(admin, regionSink, table, executor, TaskType.READ,
+ taskFutures.addAll(CanaryTool.sniff(admin, regionSink, table, executor, TaskType.READ,
this.rawScanEnabled, readLatency));
}
} else {
@@ -1210,7 +1361,7 @@ public final class Canary implements Tool {
// sniff canary table with write operation
regionSink.initializeWriteLatency();
AtomicLong writeTableLatency = regionSink.getWriteLatency();
- taskFutures.addAll(Canary.sniff(admin, regionSink, admin.getTableDescriptor(writeTableName),
+ taskFutures.addAll(CanaryTool.sniff(admin, regionSink, admin.getTableDescriptor(writeTableName),
executor, TaskType.WRITE, this.rawScanEnabled, writeTableLatency));
}
@@ -1227,23 +1378,26 @@ public final class Canary implements Tool {
if (actualReadTableLatency.containsKey(tableName)) {
Long actual = actualReadTableLatency.get(tableName).longValue();
Long configured = entry.getValue();
- LOG.info("Read operation for " + tableName + " took " + actual +
- " ms. The configured read timeout was " + configured + " ms.");
if (actual > configured) {
- LOG.error("Read operation for " + tableName + " exceeded the configured read timeout.");
+ LOG.error("Read operation for {} took {}ms, exceeded the configured read timeout. " +
+ "(Configured read timeout {}ms.)", tableName, actual, configured);
+ } else {
+ LOG.info("Read operation for {} took {}ms. (Configured read timeout {}ms.)",
+ tableName, actual, configured);
}
} else {
- LOG.error("Read operation for " + tableName + " failed!");
+ LOG.error("Read operation for {} failed!", tableName);
}
}
if (this.writeSniffing) {
String writeTableStringName = this.writeTableName.getNameAsString();
long actualWriteLatency = regionSink.getWriteLatency().longValue();
- LOG.info("Write operation for " + writeTableStringName + " took " + actualWriteLatency + " ms. The configured write timeout was " +
- this.configuredWriteTableTimeout + " ms.");
+ LOG.info("Write operation for {} took {}ms. Configured write timeout {}ms.",
+ writeTableStringName, actualWriteLatency, this.configuredWriteTableTimeout);
// Check that the writeTable write operation latency does not exceed the configured timeout.
if (actualWriteLatency > this.configuredWriteTableTimeout) {
- LOG.error("Write operation for " + writeTableStringName + " exceeded the configured write timeout.");
+ LOG.error("Write operation for {} exceeded the configured write timeout.",
+ writeTableStringName);
}
}
} catch (Exception e) {
@@ -1251,31 +1405,32 @@ public final class Canary implements Tool {
this.errorCode = ERROR_EXIT_CODE;
} finally {
this.done = true;
- }
+ }
}
this.done = true;
}
+ /**
+ * @return List of tables to use in test.
+ */
private String[] generateMonitorTables(String[] monitorTargets) throws IOException {
String[] returnTables = null;
if (this.useRegExp) {
Pattern pattern = null;
HTableDescriptor[] tds = null;
- Set<String> tmpTables = new TreeSet<String>();
+ Set<String> tmpTables = new TreeSet<>();
try {
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading list of tables"));
- }
+ LOG.debug(String.format("reading list of tables"));
tds = this.admin.listTables(pattern);
if (tds == null) {
- tds = new HTableDescriptor[0];
+ tds = new HTableDescriptor[] {};
}
for (String monitorTarget : monitorTargets) {
pattern = Pattern.compile(monitorTarget);
for (HTableDescriptor td : tds) {
- if (pattern.matcher(td.getNameAsString()).matches()) {
- tmpTables.add(td.getNameAsString());
+ if (pattern.matcher(td.getTableName().getNameAsString()).matches()) {
+ tmpTables.add(td.getTableName().getNameAsString());
}
}
}
@@ -1300,18 +1455,19 @@ public final class Canary implements Tool {
}
/*
- * canary entry point to monitor all the tables.
+ * Canary entry point to monitor all the tables.
*/
- private List<Future<Void>> sniff(TaskType taskType, RegionStdOutSink regionSink) throws Exception {
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading list of tables"));
- }
+ private List<Future<Void>> sniff(TaskType taskType, RegionStdOutSink regionSink)
+ throws Exception {
+ LOG.debug("Reading list of tables");
List<Future<Void>> taskFutures = new LinkedList<>();
- for (HTableDescriptor table : admin.listTables()) {
- if (admin.tableExists(table.getTableName()) && admin.isTableEnabled(table.getTableName())
- && (!table.getTableName().equals(writeTableName))) {
- AtomicLong readLatency = regionSink.initializeAndGetReadLatencyForTable(table.getNameAsString());
- taskFutures.addAll(Canary.sniff(admin, sink, table, executor, taskType, this.rawScanEnabled, readLatency));
+ for (HTableDescriptor td: admin.listTables()) {
+ if (admin.tableExists(td.getTableName()) && admin.isTableEnabled(td.getTableName()) &&
+ (!td.getTableName().equals(writeTableName))) {
+ AtomicLong readLatency =
+ regionSink.initializeAndGetReadLatencyForTable(td.getTableName().getNameAsString());
+ taskFutures.addAll(CanaryTool.sniff(admin, sink, td, executor, taskType, this.rawScanEnabled,
+ readLatency));
}
}
return taskFutures;
@@ -1319,7 +1475,8 @@ public final class Canary implements Tool {
private void checkWriteTableDistribution() throws IOException {
if (!admin.tableExists(writeTableName)) {
- int numberOfServers = admin.getClusterStatus().getServers().size();
+ int numberOfServers =
+ admin.getClusterStatus().getServersSize();
if (numberOfServers == 0) {
throw new IllegalStateException("No live regionservers");
}
@@ -1330,7 +1487,8 @@ public final class Canary implements Tool {
admin.enableTable(writeTableName);
}
- ClusterStatus status = admin.getClusterStatus();
+ ClusterStatus status =
+ admin.getClusterStatus();
int numberOfServers = status.getServersSize();
if (status.getServers().contains(status.getMaster())) {
numberOfServers -= 1;
@@ -1362,11 +1520,10 @@ public final class Canary implements Tool {
private void createWriteTable(int numberOfServers) throws IOException {
int numberOfRegions = (int)(numberOfServers * regionsLowerLimit);
- LOG.info("Number of live regionservers: " + numberOfServers + ", "
- + "pre-splitting the canary table into " + numberOfRegions + " regions "
- + "(current lower limit of regions per server is " + regionsLowerLimit
- + " and you can change it by config: "
- + HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY + " )");
+ LOG.info("Number of live regionservers {}, pre-splitting the canary table into {} regions " +
+ "(current lower limit of regions per server is {} and you can change it with config {}).",
+ numberOfServers, numberOfRegions, regionsLowerLimit,
+ HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY);
HTableDescriptor desc = new HTableDescriptor(writeTableName);
HColumnDescriptor family = new HColumnDescriptor(CANARY_TABLE_FAMILY_NAME);
family.setMaxVersions(1);
@@ -1383,61 +1540,43 @@ public final class Canary implements Tool {
* @throws Exception
*/
private static List<Future<Void>> sniff(final Admin admin, final Sink sink, String tableName,
- ExecutorService executor, TaskType taskType, boolean rawScanEnabled, AtomicLong readLatency) throws Exception {
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("checking table is enabled and getting table descriptor for table %s",
- tableName));
- }
+ ExecutorService executor, TaskType taskType, boolean rawScanEnabled, AtomicLong readLatency)
+ throws Exception {
+ LOG.debug("Checking table is enabled and getting table descriptor for table {}", tableName);
if (admin.isTableEnabled(TableName.valueOf(tableName))) {
- return Canary.sniff(admin, sink, admin.getTableDescriptor(TableName.valueOf(tableName)),
+ return CanaryTool.sniff(admin, sink, admin.getTableDescriptor(TableName.valueOf(tableName)),
executor, taskType, rawScanEnabled, readLatency);
} else {
- LOG.warn(String.format("Table %s is not enabled", tableName));
+ LOG.warn("Table {} is not enabled", tableName);
}
- return new LinkedList<Future<Void>>();
+ return new LinkedList<>();
}
/*
- * Loops over regions that owns this table, and output some information about the state.
+ * Loops over regions of this table, and outputs information about the state.
*/
private static List<Future<Void>> sniff(final Admin admin, final Sink sink,
HTableDescriptor tableDesc, ExecutorService executor, TaskType taskType,
- boolean rawScanEnabled, AtomicLong rwLatency) throws Exception {
-
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading list of regions for table %s", tableDesc.getTableName()));
- }
-
- Table table = null;
- try {
- table = admin.getConnection().getTable(tableDesc.getTableName());
- } catch (TableNotFoundException e) {
- return new ArrayList<Future<Void>>();
- } finally {
- if (table != null) {
- table.close();
- }
- }
-
- List<RegionTask> tasks = new ArrayList<RegionTask>();
- RegionLocator regionLocator = null;
- try {
- regionLocator = admin.getConnection().getRegionLocator(tableDesc.getTableName());
- for (HRegionLocation location : regionLocator.getAllRegionLocations()) {
- ServerName rs = location.getServerName();
- HRegionInfo region = location.getRegionInfo();
- tasks.add(new RegionTask(admin.getConnection(), region, rs, (RegionStdOutSink) sink, taskType, rawScanEnabled,
- rwLatency));
- Map<String, RegionTaskResult> regionMap = ((RegionStdOutSink) sink).getRegionMap();
- regionMap.put(region.getRegionNameAsString(), new RegionTaskResult(region,
- region.getTable(), rs));
- }
- } finally {
- if (regionLocator != null) {
- regionLocator.close();
+ boolean rawScanEnabled, AtomicLong rwLatency) throws Exception {
+ LOG.debug("Reading list of regions for table {}", tableDesc.getTableName());
+ try (Table table = admin.getConnection().getTable(tableDesc.getTableName())) {
+ List<RegionTask> tasks = new ArrayList<>();
+ try (RegionLocator regionLocator =
+ admin.getConnection().getRegionLocator(tableDesc.getTableName())) {
+ for (HRegionLocation location: regionLocator.getAllRegionLocations()) {
+ ServerName rs = location.getServerName();
+ HRegionInfo region = location.getRegionInfo();
+ tasks.add(new RegionTask(admin.getConnection(), region, rs, (RegionStdOutSink)sink,
+ taskType, rawScanEnabled, rwLatency));
+ Map<String, RegionTaskResult> regionMap = ((RegionStdOutSink) sink).getRegionMap();
+ regionMap.put(region.getRegionNameAsString(), new RegionTaskResult(region,
+ region.getTable(), rs));
+ }
+ return executor.invokeAll(tasks);
}
+ } catch (TableNotFoundException e) {
+ return Collections.EMPTY_LIST;
}
- return executor.invokeAll(tasks);
}
// monitor for zookeeper mode
@@ -1447,8 +1586,9 @@ public final class Canary implements Tool {
private final int timeout;
protected ZookeeperMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
- StdOutSink sink, ExecutorService executor, boolean treatFailureAsError, long allowedFailures) {
- super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures);
+ Sink sink, ExecutorService executor, boolean treatFailureAsError, long allowedFailures) {
+ super(connection, monitorTargets, useRegExp,
+ sink, executor, treatFailureAsError, allowedFailures);
Configuration configuration = connection.getConfiguration();
znode =
configuration.get(ZOOKEEPER_ZNODE_PARENT,
@@ -1462,9 +1602,9 @@ public final class Canary implements Tool {
hosts.add(server.toString());
}
if (allowedFailures > (hosts.size() - 1) / 2) {
- LOG.warn(String.format("Confirm allowable number of failed ZooKeeper nodes, as quorum will " +
- "already be lost. Setting of %d failures is unexpected for %d ensemble size.",
- allowedFailures, hosts.size()));
+ LOG.warn("Confirm allowable number of failed ZooKeeper nodes, as quorum will " +
+ "already be lost. Setting of {} failures is unexpected for {} ensemble size.",
+ allowedFailures, hosts.size());
}
}
@@ -1507,15 +1647,17 @@ public final class Canary implements Tool {
}
- // a monitor for regionserver mode
+ /**
+ * A monitor for regionserver mode
+ */
private static class RegionServerMonitor extends Monitor {
-
private boolean allRegions;
public RegionServerMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
- StdOutSink sink, ExecutorService executor, boolean allRegions,
+ Sink sink, ExecutorService executor, boolean allRegions,
boolean treatFailureAsError, long allowedFailures) {
- super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures);
+ super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError,
+ allowedFailures);
this.allRegions = allRegions;
}
@@ -1544,12 +1686,9 @@ public final class Canary implements Tool {
}
private boolean checkNoTableNames() {
- List<String> foundTableNames = new ArrayList<String>();
+ List<String> foundTableNames = new ArrayList<>();
TableName[] tableNames = null;
-
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading list of tables"));
- }
+ LOG.debug("Reading list of tables");
try {
tableNames = this.admin.listTableNames();
} catch (IOException e) {
@@ -1573,11 +1712,10 @@ public final class Canary implements Tool {
"option, tablenames:" + foundTableNames.toString());
this.errorCode = USAGE_EXIT_CODE;
}
- return foundTableNames.size() == 0;
+ return foundTableNames.isEmpty();
}
- private void monitorRegionServers(Map<String, List<HRegionInfo>> rsAndRMap,
- RegionServerStdOutSink regionServerSink) {
+ private void monitorRegionServers(Map<String, List<HRegionInfo>> rsAndRMap, RegionServerStdOutSink regionServerSink) {
List<RegionServerTask> tasks = new ArrayList<>();
Map<String, AtomicLong> successMap = new HashMap<>();
Random rand = new Random();
@@ -1586,7 +1724,7 @@ public final class Canary implements Tool {
AtomicLong successes = new AtomicLong(0);
successMap.put(serverName, successes);
if (entry.getValue().isEmpty()) {
- LOG.error(String.format("Regionserver not serving any regions - %s", serverName));
+ LOG.error("Regionserver not serving any regions - {}", serverName);
} else if (this.allRegions) {
for (HRegionInfo region : entry.getValue()) {
tasks.add(new RegionServerTask(this.connection,
@@ -1617,8 +1755,8 @@ public final class Canary implements Tool {
if (this.allRegions) {
for (Map.Entry<String, List<HRegionInfo>> entry : rsAndRMap.entrySet()) {
String serverName = entry.getKey();
- LOG.info("Successfully read " + successMap.get(serverName) + " regions out of "
- + entry.getValue().size() + " on regionserver:" + serverName);
+ LOG.info("Successfully read {} regions out of {} on regionserver {}",
+ successMap.get(serverName), entry.getValue().size(), serverName);
}
}
} catch (InterruptedException e) {
@@ -1634,56 +1772,41 @@ public final class Canary implements Tool {
}
private Map<String, List<HRegionInfo>> getAllRegionServerByName() {
- Map<String, List<HRegionInfo>> rsAndRMap = new HashMap<String, List<HRegionInfo>>();
- Table table = null;
- RegionLocator regionLocator = null;
+ Map<String, List<HRegionInfo>> rsAndRMap = new HashMap<>();
try {
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("reading list of tables and locations"));
- }
+ LOG.debug("Reading list of tables and locations");
HTableDescriptor[] tableDescs = this.admin.listTables();
List<HRegionInfo> regions = null;
- for (HTableDescriptor tableDesc : tableDescs) {
- table = this.admin.getConnection().getTable(tableDesc.getTableName());
- regionLocator = this.admin.getConnection().getRegionLocator(tableDesc.getTableName());
-
- for (HRegionLocation location : regionLocator.getAllRegionLocations()) {
- ServerName rs = location.getServerName();
- String rsName = rs.getHostname();
- HRegionInfo r = location.getRegionInfo();
-
- if (rsAndRMap.containsKey(rsName)) {
- regions = rsAndRMap.get(rsName);
- } else {
- regions = new ArrayList<HRegionInfo>();
- rsAndRMap.put(rsName, regions);
+ for (HTableDescriptor tableDesc: tableDescs) {
+ try (RegionLocator regionLocator =
+ this.admin.getConnection().getRegionLocator(tableDesc.getTableName())) {
+ for (HRegionLocation location : regionLocator.getAllRegionLocations()) {
+ ServerName rs = location.getServerName();
+ String rsName = rs.getHostname();
+ HRegionInfo r = location.getRegionInfo();
+ if (rsAndRMap.containsKey(rsName)) {
+ regions = rsAndRMap.get(rsName);
+ } else {
+ regions = new ArrayList<>();
+ rsAndRMap.put(rsName, regions);
+ }
+ regions.add(r);
}
- regions.add(r);
}
- table.close();
}
- //get any live regionservers not serving any regions
- for (ServerName rs : this.admin.getClusterStatus().getServers()) {
+ // get any live regionservers not serving any regions
+ for (ServerName rs: this.admin.getClusterStatus()
+ .getServers()) {
String rsName = rs.getHostname();
if (!rsAndRMap.containsKey(rsName)) {
- rsAndRMap.put(rsName, Collections.<HRegionInfo>emptyList());
+ rsAndRMap.put(rsName, Collections.<HRegionInfo> emptyList());
}
}
} catch (IOException e) {
- String msg = "Get HTables info failed";
- LOG.error(msg, e);
+ LOG.error("Get HTables info failed", e);
this.errorCode = INIT_ERROR_EXIT_CODE;
- } finally {
- if (table != null) {
- try {
- table.close();
- } catch (IOException e) {
- LOG.warn("Close table failed", e);
- }
- }
}
-
return rsAndRMap;
}
@@ -1693,7 +1816,7 @@ public final class Canary implements Tool {
Map<String, List<HRegionInfo>> filteredRsAndRMap = null;
if (this.targets != null && this.targets.length > 0) {
- filteredRsAndRMap = new HashMap<String, List<HRegionInfo>>();
+ filteredRsAndRMap = new HashMap<>();
Pattern pattern = null;
Matcher matcher = null;
boolean regExpFound = false;
@@ -1709,13 +1832,13 @@ public final class Canary implements Tool {
}
}
if (!regExpFound) {
- LOG.info("No RegionServerInfo found, regionServerPattern:" + rsName);
+ LOG.info("No RegionServerInfo found, regionServerPattern {}", rsName);
}
} else {
if (fullRsAndRMap.containsKey(rsName)) {
filteredRsAndRMap.put(rsName, fullRsAndRMap.get(rsName));
} else {
- LOG.info("No RegionServerInfo found, regionServerName:" + rsName);
+ LOG.info("No RegionServerInfo found, regionServerName {}", rsName);
}
}
}
@@ -1729,20 +1852,16 @@ public final class Canary implements Tool {
public static void main(String[] args) throws Exception {
final Configuration conf = HBaseConfiguration.create();
- // loading the generic options to conf
- new GenericOptionsParser(conf, args);
-
int numThreads = conf.getInt("hbase.canary.threads.num", MAX_THREADS_NUM);
- LOG.info("Number of execution threads " + numThreads);
+ LOG.info("Execution thread count={}", numThreads);
+ int exitCode;
ExecutorService executor = new ScheduledThreadPoolExecutor(numThreads);
-
- Class<? extends Sink> sinkClass =
- conf.getClass("hbase.canary.sink.class", RegionServerStdOutSink.class, Sink.class);
- Sink sink = ReflectionUtils.newInstance(sinkClass);
-
- int exitCode = ToolRunner.run(conf, new Canary(executor, sink), args);
- executor.shutdown();
+ try {
+ exitCode = ToolRunner.run(conf, new CanaryTool(executor), args);
+ } finally {
+ executor.shutdown();
+ }
System.exit(exitCode);
}
-}
\ No newline at end of file
+}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java
index 7c435a8..c07e80f 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java
@@ -113,9 +113,9 @@ public class TestCanaryTool {
table.put(p);
}
ExecutorService executor = new ScheduledThreadPoolExecutor(1);
- Canary.RegionStdOutSink sink = spy(new Canary.RegionStdOutSink());
- Canary canary = new Canary(executor, sink);
- String[] args = { "-writeSniffing", "-t", "10000", "testTable" };
+ CanaryTool.RegionStdOutSink sink = spy(new CanaryTool.RegionStdOutSink());
+ CanaryTool canary = new CanaryTool(executor, sink);
+ String[] args = { "-writeSniffing", "-t", "10000", tableName.getNameAsString() };
assertEquals(0, ToolRunner.run(testingUtility.getConfiguration(), canary, args));
assertEquals("verify no read error count", 0, canary.getReadFailures().size());
assertEquals("verify no write error count", 0, canary.getWriteFailures().size());
@@ -134,9 +134,9 @@ public class TestCanaryTool {
table.put(p);
}
ExecutorService executor = new ScheduledThreadPoolExecutor(1);
- Canary.RegionStdOutSink sink = spy(new Canary.RegionStdOutSink());
- Canary canary = new Canary(executor, sink);
- String[] args = {"-writeSniffing", "-t", "10000", "testCanaryRegionTaskResult"};
+ CanaryTool.RegionStdOutSink sink = spy(new CanaryTool.RegionStdOutSink());
+ CanaryTool canary = new CanaryTool(executor, sink);
+ String[] args = { "-writeSniffing", "-t", "10000", "testCanaryRegionTaskResult" };
assertEquals(0, ToolRunner.run(testingUtility.getConfiguration(), canary, args));
assertTrue("verify read success count > 0", sink.getReadSuccessCount() > 0);
@@ -147,12 +147,12 @@ public class TestCanaryTool {
isA(HColumnDescriptor.class), anyLong());
assertTrue("canary should expect to scan at least 1 region",
- sink.getTotalExpectedRegions() > 0);
- Map<String, Canary.RegionTaskResult> regionMap = sink.getRegionMap();
+ sink.getTotalExpectedRegions() > 0);
+ Map<String, CanaryTool.RegionTaskResult> regionMap = sink.getRegionMap();
assertFalse("verify region map has size > 0", regionMap.isEmpty());
for (String regionName : regionMap.keySet()) {
- Canary.RegionTaskResult res = regionMap.get(regionName);
+ CanaryTool.RegionTaskResult res = regionMap.get(regionName);
assertNotNull("verify each expected region has a RegionTaskResult object in the map", res);
assertNotNull("verify getRegionNameAsString()", regionName);
assertNotNull("verify getRegionInfo()", res.getRegionInfo());
@@ -161,7 +161,7 @@ public class TestCanaryTool {
assertNotNull("verify getServerName()", res.getServerName());
assertNotNull("verify getServerNameAsString()", res.getServerNameAsString());
- if (regionName.contains(Canary.DEFAULT_WRITE_TABLE_NAME.getNameAsString())) {
+ if (regionName.contains(CanaryTool.DEFAULT_WRITE_TABLE_NAME.getNameAsString())) {
assertTrue("write to region " + regionName + " succeeded", res.isWriteSuccess());
assertTrue("write took some time", res.getWriteLatency() > -1);
} else {
@@ -189,8 +189,8 @@ public class TestCanaryTool {
}
}
ExecutorService executor = new ScheduledThreadPoolExecutor(1);
- Canary.RegionStdOutSink sink = spy(new Canary.RegionStdOutSink());
- Canary canary = new Canary(executor, sink);
+ CanaryTool.RegionStdOutSink sink = spy(new CanaryTool.RegionStdOutSink());
+ CanaryTool canary = new CanaryTool(executor, sink);
String configuredTimeoutStr = tableNames[0].getNameAsString() + "=" + Long.MAX_VALUE + "," +
tableNames[1].getNameAsString() + "=0";
String[] args = { "-readTableTimeouts", configuredTimeoutStr, tableNames[0].getNameAsString(), tableNames[1].getNameAsString()};
@@ -210,7 +210,7 @@ public class TestCanaryTool {
verify(mockAppender, times(2)).doAppend(argThat(new ArgumentMatcher<LoggingEvent>() {
@Override
public boolean matches(Object argument) {
- return ((LoggingEvent) argument).getRenderedMessage().contains("The configured read timeout was");
+ return ((LoggingEvent) argument).getRenderedMessage().contains("Configured read timeout");
}
}));
}
@@ -219,18 +219,19 @@ public class TestCanaryTool {
@Ignore("Intermittent argument matching failures, see HBASE-18813")
public void testWriteTableTimeout() throws Exception {
ExecutorService executor = new ScheduledThreadPoolExecutor(1);
- Canary.RegionStdOutSink sink = spy(new Canary.RegionStdOutSink());
- Canary canary = new Canary(executor, sink);
+ CanaryTool.RegionStdOutSink sink = spy(new CanaryTool.RegionStdOutSink());
+ CanaryTool canary = new CanaryTool(executor, sink);
String[] args = { "-writeSniffing", "-writeTableTimeout", String.valueOf(Long.MAX_VALUE)};
assertEquals(0, ToolRunner.run(testingUtility.getConfiguration(), canary, args));
assertNotEquals("verify non-null write latency", null, sink.getWriteLatency());
assertNotEquals("verify non-zero write latency", 0L, sink.getWriteLatency());
- verify(mockAppender, times(1)).doAppend(argThat(new ArgumentMatcher<LoggingEvent>() {
- @Override
- public boolean matches(Object argument) {
- return ((LoggingEvent) argument).getRenderedMessage().contains("The configured write timeout was");
- }
- }));
+ verify(mockAppender, times(1)).doAppend(argThat(
+ new ArgumentMatcher<LoggingEvent>() {
+ @Override
+ public boolean matches(Object argument) {
+ return ((LoggingEvent) argument).getRenderedMessage().contains("Configured write timeout");
+ }
+ }));
}
//no table created, so there should be no regions
@@ -271,10 +272,11 @@ public class TestCanaryTool {
table.put(p);
}
ExecutorService executor = new ScheduledThreadPoolExecutor(1);
- Canary.RegionStdOutSink sink = spy(new Canary.RegionStdOutSink());
- Canary canary = new Canary(executor, sink);
+ CanaryTool.RegionStdOutSink sink = spy(new CanaryTool.RegionStdOutSink());
+ CanaryTool canary = new CanaryTool(executor, sink);
String[] args = { "-t", "10000", "testTableRawScan" };
- org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration(testingUtility.getConfiguration());
+ org.apache.hadoop.conf.Configuration conf =
+ new org.apache.hadoop.conf.Configuration(testingUtility.getConfiguration());
conf.setBoolean(HConstants.HBASE_CANARY_READ_RAW_SCAN_KEY, true);
assertEquals(0, ToolRunner.run(conf, canary, args));
verify(sink, atLeastOnce())
@@ -284,7 +286,7 @@ public class TestCanaryTool {
private void runRegionserverCanary() throws Exception {
ExecutorService executor = new ScheduledThreadPoolExecutor(1);
- Canary canary = new Canary(executor, new Canary.RegionServerStdOutSink());
+ CanaryTool canary = new CanaryTool(executor, new CanaryTool.RegionServerStdOutSink());
String[] args = { "-t", "10000", "-regionserver"};
assertEquals(0, ToolRunner.run(testingUtility.getConfiguration(), canary, args));
assertEquals("verify no read error count", 0, canary.getReadFailures().size());
@@ -296,8 +298,8 @@ public class TestCanaryTool {
testingUtility.getConfiguration().set(HConstants.ZOOKEEPER_QUORUM,
"localhost:" + port + "/hbase");
ExecutorService executor = new ScheduledThreadPoolExecutor(2);
- Canary.ZookeeperStdOutSink sink = spy(new Canary.ZookeeperStdOutSink());
- Canary canary = new Canary(executor, sink);
+ CanaryTool.ZookeeperStdOutSink sink = spy(new CanaryTool.ZookeeperStdOutSink());
+ CanaryTool canary = new CanaryTool(executor, sink);
assertEquals(0, ToolRunner.run(testingUtility.getConfiguration(), canary, args));
String baseZnode = testingUtility.getConfiguration()
@@ -305,4 +307,4 @@ public class TestCanaryTool {
verify(sink, atLeastOnce())
.publishReadTiming(eq(baseZnode), eq("localhost:" + port), anyLong());
}
-}
\ No newline at end of file
+}
diff --git a/src/main/asciidoc/_chapters/ops_mgt.adoc b/src/main/asciidoc/_chapters/ops_mgt.adoc
index 767ba5a..90b3e01 100644
--- a/src/main/asciidoc/_chapters/ops_mgt.adoc
+++ b/src/main/asciidoc/_chapters/ops_mgt.adoc
@@ -81,7 +81,7 @@ To see the usage, use the `--help` parameter.
----
$ ${HBASE_HOME}/bin/hbase canary -help
-Usage: bin/hbase org.apache.hadoop.hbase.tool.Canary [opts] [table1 [table2]...] | [regionserver1 [regionserver2]..]
+Usage: hbase canary [opts] [table1 [table2]...] | [regionserver1 [regionserver2]..]
where [opts] are:
-help Show this help and exit.
-regionserver replace the table argument to regionserver,