You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by en...@apache.org on 2013/10/29 23:05:58 UTC
svn commit: r1536910 - in /hbase/branches/0.96:
hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/
hbase-common/src/main/java/org/apache/hadoop/hbase/util/
hbase-it/src/test/java/org/apache/hadoop/hbase/
hbase-it/src/test/java/org/apache/had...
Author: enis
Date: Tue Oct 29 22:05:58 2013
New Revision: 1536910
URL: http://svn.apache.org/r1536910
Log:
HBASE-9750 Add retries around Action server stop/start
Modified:
hbase/branches/0.96/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java
hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounter.java
hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounterFactory.java
hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRegionsOfTableAction.java
hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java
hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
Modified: hbase/branches/0.96/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java?rev=1536910&r1=1536909&r2=1536910&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java (original)
+++ hbase/branches/0.96/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java Tue Oct 29 22:05:58 2013
@@ -111,7 +111,7 @@ public class RecoverableZooKeeper {
// TODO: Add support for zk 'chroot'; we don't add it to the quorumServers String as we should.
this.zk = new ZooKeeper(quorumServers, sessionTimeout, watcher);
this.retryCounterFactory =
- new RetryCounterFactory(maxRetries, retryIntervalMillis);
+ new RetryCounterFactory(maxRetries+1, retryIntervalMillis);
if (identifier == null || identifier.length() == 0) {
// the identifier = processID@hostName
@@ -177,7 +177,6 @@ public class RecoverableZooKeeper {
}
}
retryCounter.sleepUntilNextRetry();
- retryCounter.useRetry();
isRetry = true;
}
} finally {
@@ -211,7 +210,6 @@ public class RecoverableZooKeeper {
}
}
retryCounter.sleepUntilNextRetry();
- retryCounter.useRetry();
}
} finally {
if (traceScope != null) traceScope.close();
@@ -244,7 +242,6 @@ public class RecoverableZooKeeper {
}
}
retryCounter.sleepUntilNextRetry();
- retryCounter.useRetry();
}
} finally {
if (traceScope != null) traceScope.close();
@@ -256,7 +253,7 @@ public class RecoverableZooKeeper {
LOG.warn("Possibly transient ZooKeeper, quorum=" + quorumServers + ", exception=" + e);
if (!retryCounter.shouldRetry()) {
LOG.error("ZooKeeper " + opName + " failed after "
- + retryCounter.getMaxRetries() + " retries");
+ + retryCounter.getMaxAttempts() + " attempts");
throw e;
}
}
@@ -287,7 +284,6 @@ public class RecoverableZooKeeper {
}
}
retryCounter.sleepUntilNextRetry();
- retryCounter.useRetry();
}
} finally {
if (traceScope != null) traceScope.close();
@@ -320,7 +316,6 @@ public class RecoverableZooKeeper {
}
}
retryCounter.sleepUntilNextRetry();
- retryCounter.useRetry();
}
} finally {
if (traceScope != null) traceScope.close();
@@ -354,7 +349,6 @@ public class RecoverableZooKeeper {
}
}
retryCounter.sleepUntilNextRetry();
- retryCounter.useRetry();
}
} finally {
if (traceScope != null) traceScope.close();
@@ -388,7 +382,6 @@ public class RecoverableZooKeeper {
}
}
retryCounter.sleepUntilNextRetry();
- retryCounter.useRetry();
}
} finally {
if (traceScope != null) traceScope.close();
@@ -440,7 +433,6 @@ public class RecoverableZooKeeper {
}
}
retryCounter.sleepUntilNextRetry();
- retryCounter.useRetry();
isRetry = true;
}
} finally {
@@ -528,7 +520,6 @@ public class RecoverableZooKeeper {
}
}
retryCounter.sleepUntilNextRetry();
- retryCounter.useRetry();
isRetry = true;
}
}
@@ -563,7 +554,6 @@ public class RecoverableZooKeeper {
}
}
retryCounter.sleepUntilNextRetry();
- retryCounter.useRetry();
}
}
/**
@@ -620,7 +610,6 @@ public class RecoverableZooKeeper {
}
}
retryCounter.sleepUntilNextRetry();
- retryCounter.useRetry();
}
} finally {
if (traceScope != null) traceScope.close();
Modified: hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounter.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounter.java?rev=1536910&r1=1536909&r2=1536910&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounter.java (original)
+++ hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounter.java Tue Oct 29 22:05:58 2013
@@ -26,44 +26,150 @@ import org.apache.hadoop.classification.
@InterfaceAudience.Private
public class RetryCounter {
+
+ /**
+ * Configuration for a retry counter
+ */
+ public static class RetryConfig {
+ private int maxAttempts;
+ private long sleepInterval;
+ private long maxSleepTime;
+ private TimeUnit timeUnit;
+ private BackoffPolicy backoffPolicy;
+
+ private static final BackoffPolicy DEFAULT_BACKOFF_POLICY = new ExponentialBackoffPolicy();
+
+ public RetryConfig() {
+ maxAttempts = 1;
+ sleepInterval = 1000;
+ maxSleepTime = -1;
+ timeUnit = TimeUnit.MILLISECONDS;
+ backoffPolicy = DEFAULT_BACKOFF_POLICY;
+ }
+
+ public RetryConfig(int maxAttempts, long sleepInterval, long maxSleepTime,
+ TimeUnit timeUnit, BackoffPolicy backoffPolicy) {
+ this.maxAttempts = maxAttempts;
+ this.sleepInterval = sleepInterval;
+ this.maxSleepTime = maxSleepTime;
+ this.timeUnit = timeUnit;
+ this.backoffPolicy = backoffPolicy;
+ }
+
+ public RetryConfig setBackoffPolicy(BackoffPolicy backoffPolicy) {
+ this.backoffPolicy = backoffPolicy;
+ return this;
+ }
+
+ public RetryConfig setMaxAttempts(int maxAttempts) {
+ this.maxAttempts = maxAttempts;
+ return this;
+ }
+
+ public RetryConfig setMaxSleepTime(long maxSleepTime) {
+ this.maxSleepTime = maxSleepTime;
+ return this;
+ }
+
+ public RetryConfig setSleepInterval(long sleepInterval) {
+ this.sleepInterval = sleepInterval;
+ return this;
+ }
+
+ public RetryConfig setTimeUnit(TimeUnit timeUnit) {
+ this.timeUnit = timeUnit;
+ return this;
+ }
+
+ public int getMaxAttempts() {
+ return maxAttempts;
+ }
+
+ public long getMaxSleepTime() {
+ return maxSleepTime;
+ }
+
+ public long getSleepInterval() {
+ return sleepInterval;
+ }
+
+ public TimeUnit getTimeUnit() {
+ return timeUnit;
+ }
+
+ public BackoffPolicy getBackoffPolicy() {
+ return backoffPolicy;
+ }
+ }
+
+ /**
+ * Policy for calculating sleeping intervals between retry attempts
+ */
+ public static class BackoffPolicy {
+ public long getBackoffTime(RetryConfig config, int attempts) {
+ return config.getSleepInterval();
+ }
+ }
+
+ public static class ExponentialBackoffPolicy extends BackoffPolicy {
+ @Override
+ public long getBackoffTime(RetryConfig config, int attempts) {
+ long backoffTime = (long) (config.getSleepInterval() * Math.pow(2, attempts));
+ return backoffTime;
+ }
+ }
+
+ public static class ExponentialBackoffPolicyWithLimit extends ExponentialBackoffPolicy {
+ @Override
+ public long getBackoffTime(RetryConfig config, int attempts) {
+ long backoffTime = super.getBackoffTime(config, attempts);
+ return config.getMaxSleepTime() > 0 ? Math.min(backoffTime, config.getMaxSleepTime()) : backoffTime;
+ }
+ }
+
private static final Log LOG = LogFactory.getLog(RetryCounter.class);
- private final int maxRetries;
- private int retriesRemaining;
- private final int retryIntervalMillis;
- private final TimeUnit timeUnit;
-
- public RetryCounter(int maxRetries,
- int retryIntervalMillis, TimeUnit timeUnit) {
- this.maxRetries = maxRetries;
- this.retriesRemaining = maxRetries;
- this.retryIntervalMillis = retryIntervalMillis;
- this.timeUnit = timeUnit;
+
+ private RetryConfig retryConfig;
+ private int attempts;
+
+ public RetryCounter(int maxAttempts, long sleepInterval, TimeUnit timeUnit) {
+ this(new RetryConfig(maxAttempts, sleepInterval, -1, timeUnit, new ExponentialBackoffPolicy()));
+ }
+
+ public RetryCounter(RetryConfig retryConfig) {
+ this.attempts = 0;
+ this.retryConfig = retryConfig;
}
- public int getMaxRetries() {
- return maxRetries;
+ public int getMaxAttempts() {
+ return retryConfig.getMaxAttempts();
}
/**
- * Sleep for a exponentially back off time
+ * Sleeps for the back-off time supplied by the backoff policy, then increments the attempt count
* @throws InterruptedException
*/
public void sleepUntilNextRetry() throws InterruptedException {
int attempts = getAttemptTimes();
- long sleepTime = (long) (retryIntervalMillis * Math.pow(2, attempts));
+ long sleepTime = retryConfig.backoffPolicy.getBackoffTime(retryConfig, attempts);
LOG.info("Sleeping " + sleepTime + "ms before retry #" + attempts + "...");
- timeUnit.sleep(sleepTime);
+ retryConfig.getTimeUnit().sleep(sleepTime);
+ useRetry();
}
public boolean shouldRetry() {
- return retriesRemaining > 0;
+ return attempts < retryConfig.getMaxAttempts();
}
public void useRetry() {
- retriesRemaining--;
+ attempts++;
+ }
+
+ public boolean isRetry() {
+ return attempts > 0;
}
-
+
public int getAttemptTimes() {
- return maxRetries-retriesRemaining+1;
+ return attempts;
}
}
Modified: hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounterFactory.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounterFactory.java?rev=1536910&r1=1536909&r2=1536910&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounterFactory.java (original)
+++ hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounterFactory.java Tue Oct 29 22:05:58 2013
@@ -21,20 +21,27 @@ package org.apache.hadoop.hbase.util;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.hbase.util.RetryCounter.ExponentialBackoffPolicy;
+import org.apache.hadoop.hbase.util.RetryCounter.RetryConfig;
@InterfaceAudience.Private
public class RetryCounterFactory {
- private final int maxRetries;
- private final int retryIntervalMillis;
+ private final RetryConfig retryConfig;
- public RetryCounterFactory(int maxRetries, int retryIntervalMillis) {
- this.maxRetries = maxRetries;
- this.retryIntervalMillis = retryIntervalMillis;
+ public RetryCounterFactory(int maxAttempts, int sleepIntervalMillis) {
+ this(new RetryConfig(
+ maxAttempts,
+ sleepIntervalMillis,
+ -1,
+ TimeUnit.MILLISECONDS,
+ new ExponentialBackoffPolicy()));
+ }
+
+ public RetryCounterFactory(RetryConfig retryConfig) {
+ this.retryConfig = retryConfig;
}
public RetryCounter create() {
- return new RetryCounter(
- maxRetries, retryIntervalMillis, TimeUnit.MILLISECONDS
- );
+ return new RetryCounter(retryConfig);
}
}
Modified: hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java?rev=1536910&r1=1536909&r2=1536910&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java (original)
+++ hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java Tue Oct 29 22:05:58 2013
@@ -27,6 +27,9 @@ import org.apache.hadoop.classification.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseClusterManager.CommandProvider.Operation;
import org.apache.hadoop.hbase.util.Pair;
+import org.apache.hadoop.hbase.util.RetryCounter;
+import org.apache.hadoop.hbase.util.RetryCounter.RetryConfig;
+import org.apache.hadoop.hbase.util.RetryCounterFactory;
import org.apache.hadoop.util.Shell;
/**
@@ -48,6 +51,14 @@ public class HBaseClusterManager extends
private static final String DEFAULT_TUNNEL_CMD = "/usr/bin/ssh %1$s %2$s%3$s%4$s \"%5$s\"";
private String tunnelCmd;
+ private static final String RETRY_ATTEMPTS_KEY = "hbase.it.clustermanager.retry.attempts";
+ private static final int DEFAULT_RETRY_ATTEMPTS = 5;
+
+ private static final String RETRY_SLEEP_INTERVAL_KEY = "hbase.it.clustermanager.retry.sleep.interval";
+ private static final int DEFAULT_RETRY_SLEEP_INTERVAL = 1000;
+
+ protected RetryCounterFactory retryCounterFactory;
+
@Override
public void setConf(Configuration conf) {
super.setConf(conf);
@@ -68,6 +79,10 @@ public class HBaseClusterManager extends
(sshOptions != null && sshOptions.length() > 0)) {
LOG.info("Running with SSH user [" + sshUserName + "] and options [" + sshOptions + "]");
}
+
+ this.retryCounterFactory = new RetryCounterFactory(new RetryConfig()
+ .setMaxAttempts(conf.getInt(RETRY_ATTEMPTS_KEY, DEFAULT_RETRY_ATTEMPTS))
+ .setSleepInterval(conf.getLong(RETRY_SLEEP_INTERVAL_KEY, DEFAULT_RETRY_SLEEP_INTERVAL)));
}
/**
@@ -184,7 +199,15 @@ public class HBaseClusterManager extends
LOG.info("Executing remote command: " + StringUtils.join(cmd, " ") + " , hostname:" + hostname);
RemoteShell shell = new RemoteShell(hostname, cmd);
- shell.execute();
+ try {
+ shell.execute();
+ } catch (Shell.ExitCodeException ex) {
+ // capture the stdout of the process as well.
+ String output = shell.getOutput();
+ // add output for the ExitCodeException.
+ throw new Shell.ExitCodeException(ex.getExitCode(), "stderr: " + ex.getMessage()
+ + ", stdout: " + output);
+ }
LOG.info("Executed remote command, exit code:" + shell.getExitCode()
+ " , output:" + shell.getOutput());
@@ -192,8 +215,37 @@ public class HBaseClusterManager extends
return new Pair<Integer, String>(shell.getExitCode(), shell.getOutput());
}
+ private Pair<Integer, String> execWithRetries(String hostname, String... cmd)
+ throws IOException {
+ RetryCounter retryCounter = retryCounterFactory.create();
+ while (true) {
+ try {
+ return exec(hostname, cmd);
+ } catch (IOException e) {
+ retryOrThrow(retryCounter, e, hostname, cmd);
+ }
+ try {
+ retryCounter.sleepUntilNextRetry();
+ } catch (InterruptedException ex) {
+ // ignore
+ LOG.warn("Sleep Interrupted:" + ex);
+ }
+ }
+ }
+
+ private <E extends Exception> void retryOrThrow(RetryCounter retryCounter, E ex,
+ String hostname, String[] cmd) throws E {
+ if (retryCounter.shouldRetry()) {
+ LOG.warn("Remote command: " + StringUtils.join(cmd, " ") + " , hostname:" + hostname
+ + " failed at attempt " + retryCounter.getAttemptTimes() + ". Retrying until maxAttempts: "
+ + retryCounter.getMaxAttempts() + ". Exception: " + ex.getMessage());
+ return;
+ }
+ throw ex;
+ }
+
private void exec(String hostname, ServiceType service, Operation op) throws IOException {
- exec(hostname, getCommandProvider(service).getCommand(service, op));
+ execWithRetries(hostname, getCommandProvider(service).getCommand(service, op));
}
@Override
@@ -213,12 +265,12 @@ public class HBaseClusterManager extends
@Override
public void signal(ServiceType service, String signal, String hostname) throws IOException {
- exec(hostname, getCommandProvider(service).signalCommand(service, signal));
+ execWithRetries(hostname, getCommandProvider(service).signalCommand(service, signal));
}
@Override
public boolean isRunning(ServiceType service, String hostname) throws IOException {
- String ret = exec(hostname, getCommandProvider(service).isRunningCommand(service))
+ String ret = execWithRetries(hostname, getCommandProvider(service).isRunningCommand(service))
.getSecond();
return ret.length() > 0;
}
Modified: hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRegionsOfTableAction.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRegionsOfTableAction.java?rev=1536910&r1=1536909&r2=1536910&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRegionsOfTableAction.java (original)
+++ hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRegionsOfTableAction.java Tue Oct 29 22:05:58 2013
@@ -19,6 +19,7 @@
package org.apache.hadoop.hbase.chaos.actions;
import java.util.Collection;
+import java.util.Collections;
import java.util.List;
import org.apache.commons.lang.math.RandomUtils;
@@ -34,6 +35,7 @@ public class MoveRegionsOfTableAction ex
private final long sleepTime;
private final byte[] tableNameBytes;
private final String tableName;
+ private final long maxTime;
public MoveRegionsOfTableAction(String tableName) {
this(-1, tableName);
@@ -43,6 +45,7 @@ public class MoveRegionsOfTableAction ex
this.sleepTime = sleepTime;
this.tableNameBytes = Bytes.toBytes(tableName);
this.tableName = tableName;
+ this.maxTime = 10 * 60 * 1000; // 10 min default
}
@Override
@@ -62,6 +65,9 @@ public class MoveRegionsOfTableAction ex
return;
}
+ Collections.shuffle(regions);
+
+ long start = System.currentTimeMillis();
for (HRegionInfo regionInfo:regions) {
try {
String destServerName =
@@ -74,6 +80,12 @@ public class MoveRegionsOfTableAction ex
if (sleepTime > 0) {
Thread.sleep(sleepTime);
}
+
+ // put a limit on the total time spent moving regions. Otherwise, this won't finish
+ // in reasonable time: with a sleep time of 10sec, 100 regions take about 16min
+ if (System.currentTimeMillis() - start > maxTime) {
+ break;
+ }
}
}
}
Modified: hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java?rev=1536910&r1=1536909&r2=1536910&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java (original)
+++ hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java Tue Oct 29 22:05:58 2013
@@ -145,6 +145,7 @@ public class PolicyBasedChaosMonkey exte
return;
}
for (Thread monkeyThread : monkeyThreads) {
+ // TODO: bound the wait time per policy
monkeyThread.join();
}
}
Modified: hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java?rev=1536910&r1=1536909&r2=1536910&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java (original)
+++ hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java Tue Oct 29 22:05:58 2013
@@ -2019,7 +2019,7 @@ public class HBaseTestingUtility extends
HConstants.DEFAULT_HBASE_CLIENT_PAUSE);
int numRetries = getConfiguration().getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER);
- RetryCounter retrier = new RetryCounter(numRetries, (int)pause, TimeUnit.MICROSECONDS);
+ RetryCounter retrier = new RetryCounter(numRetries+1, (int)pause, TimeUnit.MICROSECONDS);
while(retrier.shouldRetry()) {
int index = getMiniHBaseCluster().getServerWith(firstrow);
if (index != -1) {