You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by sy...@apache.org on 2016/02/22 23:20:48 UTC
[20/22] hbase git commit: HBASE-15219 Canary tool does not return
non-zero exit code when one of regions is in stuck state
HBASE-15219 Canary tool does not return non-zero exit code when one of regions is in stuck state
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/ed290cf8
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/ed290cf8
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/ed290cf8
Branch: refs/heads/hbase-12439
Commit: ed290cf829e54cbbff740b205d41417768e04337
Parents: a878b19
Author: tedyu <yu...@gmail.com>
Authored: Sat Feb 20 20:25:01 2016 -0800
Committer: tedyu <yu...@gmail.com>
Committed: Sat Feb 20 20:25:01 2016 -0800
----------------------------------------------------------------------
.../org/apache/hadoop/hbase/tool/Canary.java | 66 +++++++++++++++++---
src/main/asciidoc/_chapters/ops_mgt.adoc | 11 ++++
2 files changed, 68 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hbase/blob/ed290cf8/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
index 3c7ae64..9a71a14 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java
@@ -94,9 +94,12 @@ import org.apache.hadoop.util.ToolRunner;
public final class Canary implements Tool {
// Sink interface used by the canary to outputs information
public interface Sink {
+ public long getReadFailureCount();
+ public long incReadFailureCount();
public void publishReadFailure(HRegionInfo region, Exception e);
public void publishReadFailure(HRegionInfo region, HColumnDescriptor column, Exception e);
public void publishReadTiming(HRegionInfo region, HColumnDescriptor column, long msTime);
+ public long getWriteFailureCount();
public void publishWriteFailure(HRegionInfo region, Exception e);
public void publishWriteFailure(HRegionInfo region, HColumnDescriptor column, Exception e);
public void publishWriteTiming(HRegionInfo region, HColumnDescriptor column, long msTime);
@@ -111,13 +114,28 @@ public final class Canary implements Tool {
// Simple implementation of canary sink that allows to plot on
// file or standard output timings or failures.
public static class StdOutSink implements Sink {
+ private AtomicLong readFailureCount = new AtomicLong(0),
+ writeFailureCount = new AtomicLong(0);
+
+ @Override
+ public long getReadFailureCount() {
+ return readFailureCount.get();
+ }
+
+ @Override
+ public long incReadFailureCount() {
+ return readFailureCount.incrementAndGet();
+ }
+
@Override
public void publishReadFailure(HRegionInfo region, Exception e) {
+ readFailureCount.incrementAndGet();
LOG.error(String.format("read from region %s failed", region.getRegionNameAsString()), e);
}
@Override
public void publishReadFailure(HRegionInfo region, HColumnDescriptor column, Exception e) {
+ readFailureCount.incrementAndGet();
LOG.error(String.format("read from region %s column family %s failed",
region.getRegionNameAsString(), column.getNameAsString()), e);
}
@@ -129,12 +147,19 @@ public final class Canary implements Tool {
}
@Override
+ public long getWriteFailureCount() {
+ return writeFailureCount.get();
+ }
+
+ @Override
public void publishWriteFailure(HRegionInfo region, Exception e) {
+ writeFailureCount.incrementAndGet();
LOG.error(String.format("write to region %s failed", region.getRegionNameAsString()), e);
}
@Override
public void publishWriteFailure(HRegionInfo region, HColumnDescriptor column, Exception e) {
+ writeFailureCount.incrementAndGet();
LOG.error(String.format("write to region %s column family %s failed",
region.getRegionNameAsString(), column.getNameAsString()), e);
}
@@ -150,6 +175,7 @@ public final class Canary implements Tool {
@Override
public void publishReadFailure(String table, String server) {
+ incReadFailureCount();
LOG.error(String.format("Read from table:%s on region server:%s", table, server));
}
@@ -412,6 +438,7 @@ public final class Canary implements Tool {
private static final int INIT_ERROR_EXIT_CODE = 2;
private static final int TIMEOUT_ERROR_EXIT_CODE = 3;
private static final int ERROR_EXIT_CODE = 4;
+ private static final int FAILURE_EXIT_CODE = 5;
private static final long DEFAULT_INTERVAL = 6000;
@@ -435,6 +462,7 @@ public final class Canary implements Tool {
private boolean regionServerMode = false;
private boolean regionServerAllRegions = false;
private boolean writeSniffing = false;
+ private boolean treatFailureAsError = false;
private TableName writeTableName = DEFAULT_WRITE_TABLE_NAME;
private ExecutorService executor; // threads to retrieve data from regionservers
@@ -498,6 +526,8 @@ public final class Canary implements Tool {
this.regionServerAllRegions = true;
} else if(cmd.equals("-writeSniffing")) {
this.writeSniffing = true;
+ } else if(cmd.equals("-treatFailureAsError")) {
+ this.treatFailureAsError = true;
} else if (cmd.equals("-e")) {
this.useRegExp = true;
} else if (cmd.equals("-t")) {
@@ -602,7 +632,7 @@ public final class Canary implements Tool {
}
}
- if (this.failOnError && monitor.hasError()) {
+ if (this.failOnError && monitor.finalCheckForErrors()) {
monitorThread.interrupt();
return monitor.errorCode;
}
@@ -638,6 +668,7 @@ public final class Canary implements Tool {
" default is true");
System.err.println(" -t <N> timeout for a check, default is 600000 (milisecs)");
System.err.println(" -writeSniffing enable the write sniffing in canary");
+ System.err.println(" -treatFailureAsError treats read / write failure as error");
System.err.println(" -writeTable The table used for write sniffing."
+ " Default is hbase:canary");
System.err
@@ -665,11 +696,12 @@ public final class Canary implements Tool {
if (this.regionServerMode) {
monitor =
new RegionServerMonitor(connection, monitorTargets, this.useRegExp,
- (ExtendedSink) this.sink, this.executor, this.regionServerAllRegions);
+ (ExtendedSink) this.sink, this.executor, this.regionServerAllRegions,
+ this.treatFailureAsError);
} else {
monitor =
new RegionMonitor(connection, monitorTargets, this.useRegExp, this.sink, this.executor,
- this.writeSniffing, this.writeTableName);
+ this.writeSniffing, this.writeTableName, this.treatFailureAsError);
}
return monitor;
}
@@ -681,6 +713,7 @@ public final class Canary implements Tool {
protected Admin admin;
protected String[] targets;
protected boolean useRegExp;
+ protected boolean treatFailureAsError;
protected boolean initialized = false;
protected boolean done = false;
@@ -696,18 +729,31 @@ public final class Canary implements Tool {
return errorCode != 0;
}
+ public boolean finalCheckForErrors() {
+ if (errorCode != 0) {
+ return true;
+ }
+ if (treatFailureAsError &&
+ (sink.getReadFailureCount() > 0 || sink.getWriteFailureCount() > 0)) {
+ errorCode = FAILURE_EXIT_CODE;
+ return true;
+ }
+ return false;
+ }
+
@Override
public void close() throws IOException {
if (this.admin != null) this.admin.close();
}
protected Monitor(Connection connection, String[] monitorTargets, boolean useRegExp, Sink sink,
- ExecutorService executor) {
+ ExecutorService executor, boolean treatFailureAsError) {
if (null == connection) throw new IllegalArgumentException("connection shall not be null");
this.connection = connection;
this.targets = monitorTargets;
this.useRegExp = useRegExp;
+ this.treatFailureAsError = treatFailureAsError;
this.sink = sink;
this.executor = executor;
}
@@ -747,8 +793,9 @@ public final class Canary implements Tool {
private int checkPeriod;
public RegionMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
- Sink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName) {
- super(connection, monitorTargets, useRegExp, sink, executor);
+ Sink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName,
+ boolean treatFailureAsError) {
+ super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError);
Configuration conf = connection.getConfiguration();
this.writeSniffing = writeSniffing;
this.writeTableName = writeTableName;
@@ -992,8 +1039,9 @@ public final class Canary implements Tool {
private boolean allRegions;
public RegionServerMonitor(Connection connection, String[] monitorTargets, boolean useRegExp,
- ExtendedSink sink, ExecutorService executor, boolean allRegions) {
- super(connection, monitorTargets, useRegExp, sink, executor);
+ ExtendedSink sink, ExecutorService executor, boolean allRegions,
+ boolean treatFailureAsError) {
+ super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError);
this.allRegions = allRegions;
}
@@ -1088,7 +1136,7 @@ public final class Canary implements Tool {
}
} catch (InterruptedException e) {
this.errorCode = ERROR_EXIT_CODE;
- LOG.error("Sniff regionserver failed!", e);
+ LOG.error("Sniff regionserver interrupted!", e);
}
}
http://git-wip-us.apache.org/repos/asf/hbase/blob/ed290cf8/src/main/asciidoc/_chapters/ops_mgt.adoc
----------------------------------------------------------------------
diff --git a/src/main/asciidoc/_chapters/ops_mgt.adoc b/src/main/asciidoc/_chapters/ops_mgt.adoc
index 66d7545..63941dc 100644
--- a/src/main/asciidoc/_chapters/ops_mgt.adoc
+++ b/src/main/asciidoc/_chapters/ops_mgt.adoc
@@ -93,6 +93,7 @@ Usage: bin/hbase org.apache.hadoop.hbase.tool.Canary [opts] [table1 [table2]...]
-f <B> stop whole program if first error occurs, default is true
-t <N> timeout for a check, default is 600000 (milliseconds)
-writeSniffing enable the write sniffing in canary
+ -treatFailureAsError treats read / write failure as error
-writeTable The table used for write sniffing. Default is hbase:canary
-D<configProperty>=<value> assigning or override the configuration params
----
@@ -215,6 +216,16 @@ $ ${HBASE_HOME}/bin/hbase canary -writeSniffing -writeTable ns:canary
The default value size of each put is 10 bytes and you can set it by the config key:
`hbase.canary.write.value.size`.
+==== Treat read / write failure as error
+
+By default, the canary tool only logs read failures (caused by, e.g., RetriesExhaustedException)
+and still returns a normal exit code. To treat read / write failures as errors, run canary
+with the `-treatFailureAsError` option. When it is enabled, any read / write failure results in
+an error exit code.
+----
+$ ${HBASE_HOME}/bin/hbase canary -treatFailureAsError
+----
+
==== Running Canary in a Kerberos-enabled Cluster
To run Canary in a Kerberos-enabled cluster, configure the following two properties in _hbase-site.xml_: