You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by en...@apache.org on 2013/10/29 23:05:58 UTC

svn commit: r1536910 - in /hbase/branches/0.96: hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ hbase-common/src/main/java/org/apache/hadoop/hbase/util/ hbase-it/src/test/java/org/apache/hadoop/hbase/ hbase-it/src/test/java/org/apache/had...

Author: enis
Date: Tue Oct 29 22:05:58 2013
New Revision: 1536910

URL: http://svn.apache.org/r1536910
Log:
HBASE-9750 Add retries around Action server stop/start

Modified:
    hbase/branches/0.96/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java
    hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounter.java
    hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounterFactory.java
    hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
    hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRegionsOfTableAction.java
    hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java
    hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java

Modified: hbase/branches/0.96/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java?rev=1536910&r1=1536909&r2=1536910&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java (original)
+++ hbase/branches/0.96/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java Tue Oct 29 22:05:58 2013
@@ -111,7 +111,7 @@ public class RecoverableZooKeeper {
     // TODO: Add support for zk 'chroot'; we don't add it to the quorumServers String as we should.
     this.zk = new ZooKeeper(quorumServers, sessionTimeout, watcher);
     this.retryCounterFactory =
-      new RetryCounterFactory(maxRetries, retryIntervalMillis);
+      new RetryCounterFactory(maxRetries+1, retryIntervalMillis);
 
     if (identifier == null || identifier.length() == 0) {
       // the identifier = processID@hostName
@@ -177,7 +177,6 @@ public class RecoverableZooKeeper {
           }
         }
         retryCounter.sleepUntilNextRetry();
-        retryCounter.useRetry();
         isRetry = true;
       }
     } finally {
@@ -211,7 +210,6 @@ public class RecoverableZooKeeper {
           }
         }
         retryCounter.sleepUntilNextRetry();
-        retryCounter.useRetry();
       }
     } finally {
       if (traceScope != null) traceScope.close();
@@ -244,7 +242,6 @@ public class RecoverableZooKeeper {
           }
         }
         retryCounter.sleepUntilNextRetry();
-        retryCounter.useRetry();
       }
     } finally {
       if (traceScope != null) traceScope.close();
@@ -256,7 +253,7 @@ public class RecoverableZooKeeper {
     LOG.warn("Possibly transient ZooKeeper, quorum=" + quorumServers + ", exception=" + e);
     if (!retryCounter.shouldRetry()) {
       LOG.error("ZooKeeper " + opName + " failed after "
-        + retryCounter.getMaxRetries() + " retries");
+        + retryCounter.getMaxAttempts() + " attempts");
       throw e;
     }
   }
@@ -287,7 +284,6 @@ public class RecoverableZooKeeper {
           }
         }
         retryCounter.sleepUntilNextRetry();
-        retryCounter.useRetry();
       }
     } finally {
       if (traceScope != null) traceScope.close();
@@ -320,7 +316,6 @@ public class RecoverableZooKeeper {
           }
         }
         retryCounter.sleepUntilNextRetry();
-        retryCounter.useRetry();
       }
     } finally {
       if (traceScope != null) traceScope.close();
@@ -354,7 +349,6 @@ public class RecoverableZooKeeper {
           }
         }
         retryCounter.sleepUntilNextRetry();
-        retryCounter.useRetry();
       }
     } finally {
       if (traceScope != null) traceScope.close();
@@ -388,7 +382,6 @@ public class RecoverableZooKeeper {
           }
         }
         retryCounter.sleepUntilNextRetry();
-        retryCounter.useRetry();
       }
     } finally {
       if (traceScope != null) traceScope.close();
@@ -440,7 +433,6 @@ public class RecoverableZooKeeper {
           }
         }
         retryCounter.sleepUntilNextRetry();
-        retryCounter.useRetry();
         isRetry = true;
       }
     } finally {
@@ -528,7 +520,6 @@ public class RecoverableZooKeeper {
         }
       }
       retryCounter.sleepUntilNextRetry();
-      retryCounter.useRetry();
       isRetry = true;
     }
   }
@@ -563,7 +554,6 @@ public class RecoverableZooKeeper {
         }
       }
       retryCounter.sleepUntilNextRetry();
-      retryCounter.useRetry();
     }
   }
   /**
@@ -620,7 +610,6 @@ public class RecoverableZooKeeper {
           }
         }
         retryCounter.sleepUntilNextRetry();
-        retryCounter.useRetry();
     }
     } finally {
       if (traceScope != null) traceScope.close();

Modified: hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounter.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounter.java?rev=1536910&r1=1536909&r2=1536910&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounter.java (original)
+++ hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounter.java Tue Oct 29 22:05:58 2013
@@ -26,44 +26,150 @@ import org.apache.hadoop.classification.
 
 @InterfaceAudience.Private
 public class RetryCounter {
+
+  /**
+   *  Configuration for a retry counter
+   */
+  public static class RetryConfig {
+    private int maxAttempts;
+    private long sleepInterval;
+    private long maxSleepTime;
+    private TimeUnit timeUnit;
+    private BackoffPolicy backoffPolicy;
+
+    private static final BackoffPolicy DEFAULT_BACKOFF_POLICY = new ExponentialBackoffPolicy();
+
+    public RetryConfig() {
+      maxAttempts    = 1;
+      sleepInterval = 1000;
+      maxSleepTime  = -1;
+      timeUnit = TimeUnit.MILLISECONDS;
+      backoffPolicy = DEFAULT_BACKOFF_POLICY;
+    }
+
+    public RetryConfig(int maxAttempts, long sleepInterval, long maxSleepTime,
+        TimeUnit timeUnit, BackoffPolicy backoffPolicy) {
+      this.maxAttempts = maxAttempts;
+      this.sleepInterval = sleepInterval;
+      this.maxSleepTime = maxSleepTime;
+      this.timeUnit = timeUnit;
+      this.backoffPolicy = backoffPolicy;
+    }
+
+    public RetryConfig setBackoffPolicy(BackoffPolicy backoffPolicy) {
+      this.backoffPolicy = backoffPolicy;
+      return this;
+    }
+
+    public RetryConfig setMaxAttempts(int maxAttempts) {
+      this.maxAttempts = maxAttempts;
+      return this;
+    }
+
+    public RetryConfig setMaxSleepTime(long maxSleepTime) {
+      this.maxSleepTime = maxSleepTime;
+      return this;
+    }
+
+    public RetryConfig setSleepInterval(long sleepInterval) {
+      this.sleepInterval = sleepInterval;
+      return this;
+    }
+
+    public RetryConfig setTimeUnit(TimeUnit timeUnit) {
+      this.timeUnit = timeUnit;
+      return this;
+    }
+
+    public int getMaxAttempts() {
+      return maxAttempts;
+    }
+
+    public long getMaxSleepTime() {
+      return maxSleepTime;
+    }
+
+    public long getSleepInterval() {
+      return sleepInterval;
+    }
+
+    public TimeUnit getTimeUnit() {
+      return timeUnit;
+    }
+
+    public BackoffPolicy getBackoffPolicy() {
+      return backoffPolicy;
+    }
+  }
+
+  /**
+   * Policy for calculating sleeping intervals between retry attempts
+   */
+  public static class BackoffPolicy {
+    public long getBackoffTime(RetryConfig config, int attempts) {
+      return config.getSleepInterval();
+    }
+  }
+
+  public static class ExponentialBackoffPolicy extends BackoffPolicy {
+    @Override
+    public long getBackoffTime(RetryConfig config, int attempts) {
+      long backoffTime = (long) (config.getSleepInterval() * Math.pow(2, attempts));
+      return backoffTime;
+    }
+  }
+
+  public static class ExponentialBackoffPolicyWithLimit extends ExponentialBackoffPolicy {
+    @Override
+    public long getBackoffTime(RetryConfig config, int attempts) {
+      long backoffTime = super.getBackoffTime(config, attempts);
+      return config.getMaxSleepTime() > 0 ? Math.min(backoffTime, config.getMaxSleepTime()) : backoffTime;
+    }
+  }
+
   private static final Log LOG = LogFactory.getLog(RetryCounter.class);
-  private final int maxRetries;
-  private int retriesRemaining;
-  private final int retryIntervalMillis;
-  private final TimeUnit timeUnit;
-
-  public RetryCounter(int maxRetries, 
-  int retryIntervalMillis, TimeUnit timeUnit) {
-    this.maxRetries = maxRetries;
-    this.retriesRemaining = maxRetries;
-    this.retryIntervalMillis = retryIntervalMillis;
-    this.timeUnit = timeUnit;
+
+  private RetryConfig retryConfig;
+  private int attempts;
+
+  public RetryCounter(int maxAttempts, long sleepInterval, TimeUnit timeUnit) {
+    this(new RetryConfig(maxAttempts, sleepInterval, -1, timeUnit, new ExponentialBackoffPolicy()));
+  }
+
+  public RetryCounter(RetryConfig retryConfig) {
+    this.attempts = 0;
+    this.retryConfig = retryConfig;
   }
 
-  public int getMaxRetries() {
-    return maxRetries;
+  public int getMaxAttempts() {
+    return retryConfig.getMaxAttempts();
   }
 
   /**
-   * Sleep for a exponentially back off time
+   * Sleep for a back off time as supplied by the backoff policy, and increases the attempts
    * @throws InterruptedException
    */
   public void sleepUntilNextRetry() throws InterruptedException {
     int attempts = getAttemptTimes();
-    long sleepTime = (long) (retryIntervalMillis * Math.pow(2, attempts));
+    long sleepTime = retryConfig.backoffPolicy.getBackoffTime(retryConfig, attempts);
     LOG.info("Sleeping " + sleepTime + "ms before retry #" + attempts + "...");
-    timeUnit.sleep(sleepTime);
+    retryConfig.getTimeUnit().sleep(sleepTime);
+    useRetry();
   }
 
   public boolean shouldRetry() {
-    return retriesRemaining > 0;
+    return attempts < retryConfig.getMaxAttempts();
   }
 
   public void useRetry() {
-    retriesRemaining--;
+    attempts++;
+  }
+
+  public boolean isRetry() {
+    return attempts > 0;
   }
-  
+
   public int getAttemptTimes() {
-    return maxRetries-retriesRemaining+1;
+    return attempts;
   }
 }

Modified: hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounterFactory.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounterFactory.java?rev=1536910&r1=1536909&r2=1536910&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounterFactory.java (original)
+++ hbase/branches/0.96/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounterFactory.java Tue Oct 29 22:05:58 2013
@@ -21,20 +21,27 @@ package org.apache.hadoop.hbase.util;
 import java.util.concurrent.TimeUnit;
 
 import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.hbase.util.RetryCounter.ExponentialBackoffPolicy;
+import org.apache.hadoop.hbase.util.RetryCounter.RetryConfig;
 
 @InterfaceAudience.Private
 public class RetryCounterFactory {
-  private final int maxRetries;
-  private final int retryIntervalMillis;
+  private final RetryConfig retryConfig;
 
-  public RetryCounterFactory(int maxRetries, int retryIntervalMillis) {
-    this.maxRetries = maxRetries;
-    this.retryIntervalMillis = retryIntervalMillis;
+  public RetryCounterFactory(int maxAttempts, int sleepIntervalMillis) {
+    this(new RetryConfig(
+      maxAttempts,
+      sleepIntervalMillis,
+      -1,
+      TimeUnit.MILLISECONDS,
+      new ExponentialBackoffPolicy()));
+  }
+
+  public RetryCounterFactory(RetryConfig retryConfig) {
+    this.retryConfig = retryConfig;
   }
 
   public RetryCounter create() {
-    return new RetryCounter(
-      maxRetries, retryIntervalMillis, TimeUnit.MILLISECONDS
-    );
+    return new RetryCounter(retryConfig);
   }
 }

Modified: hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java?rev=1536910&r1=1536909&r2=1536910&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java (original)
+++ hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java Tue Oct 29 22:05:58 2013
@@ -27,6 +27,9 @@ import org.apache.hadoop.classification.
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.HBaseClusterManager.CommandProvider.Operation;
 import org.apache.hadoop.hbase.util.Pair;
+import org.apache.hadoop.hbase.util.RetryCounter;
+import org.apache.hadoop.hbase.util.RetryCounter.RetryConfig;
+import org.apache.hadoop.hbase.util.RetryCounterFactory;
 import org.apache.hadoop.util.Shell;
 
 /**
@@ -48,6 +51,14 @@ public class HBaseClusterManager extends
   private static final String DEFAULT_TUNNEL_CMD = "/usr/bin/ssh %1$s %2$s%3$s%4$s \"%5$s\"";
   private String tunnelCmd;
 
+  private static final String RETRY_ATTEMPTS_KEY = "hbase.it.clustermanager.retry.attempts";
+  private static final int DEFAULT_RETRY_ATTEMPTS = 5;
+
+  private static final String RETRY_SLEEP_INTERVAL_KEY = "hbase.it.clustermanager.retry.sleep.interval";
+  private static final int DEFAULT_RETRY_SLEEP_INTERVAL = 1000;
+
+  protected RetryCounterFactory retryCounterFactory;
+
   @Override
   public void setConf(Configuration conf) {
     super.setConf(conf);
@@ -68,6 +79,10 @@ public class HBaseClusterManager extends
         (sshOptions != null && sshOptions.length() > 0)) {
       LOG.info("Running with SSH user [" + sshUserName + "] and options [" + sshOptions + "]");
     }
+
+    this.retryCounterFactory = new RetryCounterFactory(new RetryConfig()
+        .setMaxAttempts(conf.getInt(RETRY_ATTEMPTS_KEY, DEFAULT_RETRY_ATTEMPTS))
+        .setSleepInterval(conf.getLong(RETRY_SLEEP_INTERVAL_KEY, DEFAULT_RETRY_SLEEP_INTERVAL)));
   }
 
   /**
@@ -184,7 +199,15 @@ public class HBaseClusterManager extends
     LOG.info("Executing remote command: " + StringUtils.join(cmd, " ") + " , hostname:" + hostname);
 
     RemoteShell shell = new RemoteShell(hostname, cmd);
-    shell.execute();
+    try {
+      shell.execute();
+    } catch (Shell.ExitCodeException ex) {
+      // capture the stdout of the process as well.
+      String output = shell.getOutput();
+      // add output for the ExitCodeException.
+      throw new Shell.ExitCodeException(ex.getExitCode(), "stderr: " + ex.getMessage()
+        + ", stdout: " + output);
+    }
 
     LOG.info("Executed remote command, exit code:" + shell.getExitCode()
         + " , output:" + shell.getOutput());
@@ -192,8 +215,37 @@ public class HBaseClusterManager extends
     return new Pair<Integer, String>(shell.getExitCode(), shell.getOutput());
   }
 
+  private Pair<Integer, String> execWithRetries(String hostname, String... cmd)
+      throws IOException {
+    RetryCounter retryCounter = retryCounterFactory.create();
+    while (true) {
+      try {
+        return exec(hostname, cmd);
+      } catch (IOException e) {
+        retryOrThrow(retryCounter, e, hostname, cmd);
+      }
+      try {
+        retryCounter.sleepUntilNextRetry();
+      } catch (InterruptedException ex) {
+        // ignore
+        LOG.warn("Sleep Interrupted:" + ex);
+      }
+    }
+  }
+
+  private <E extends Exception> void retryOrThrow(RetryCounter retryCounter, E ex,
+      String hostname, String[] cmd) throws E {
+    if (retryCounter.shouldRetry()) {
+      LOG.warn("Remote command: " + StringUtils.join(cmd, " ") + " , hostname:" + hostname
+        + " failed at attempt " + retryCounter.getAttemptTimes() + ". Retrying until maxAttempts: "
+          + retryCounter.getMaxAttempts() + ". Exception: " + ex.getMessage());
+      return;
+    }
+    throw ex;
+  }
+
   private void exec(String hostname, ServiceType service, Operation op) throws IOException {
-    exec(hostname, getCommandProvider(service).getCommand(service, op));
+    execWithRetries(hostname, getCommandProvider(service).getCommand(service, op));
   }
 
   @Override
@@ -213,12 +265,12 @@ public class HBaseClusterManager extends
 
   @Override
   public void signal(ServiceType service, String signal, String hostname) throws IOException {
-    exec(hostname, getCommandProvider(service).signalCommand(service, signal));
+    execWithRetries(hostname, getCommandProvider(service).signalCommand(service, signal));
   }
 
   @Override
   public boolean isRunning(ServiceType service, String hostname) throws IOException {
-    String ret = exec(hostname, getCommandProvider(service).isRunningCommand(service))
+    String ret = execWithRetries(hostname, getCommandProvider(service).isRunningCommand(service))
         .getSecond();
     return ret.length() > 0;
   }

Modified: hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRegionsOfTableAction.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRegionsOfTableAction.java?rev=1536910&r1=1536909&r2=1536910&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRegionsOfTableAction.java (original)
+++ hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/MoveRegionsOfTableAction.java Tue Oct 29 22:05:58 2013
@@ -19,6 +19,7 @@
 package org.apache.hadoop.hbase.chaos.actions;
 
 import java.util.Collection;
+import java.util.Collections;
 import java.util.List;
 
 import org.apache.commons.lang.math.RandomUtils;
@@ -34,6 +35,7 @@ public class MoveRegionsOfTableAction ex
   private final long sleepTime;
   private final byte[] tableNameBytes;
   private final String tableName;
+  private final long maxTime;
 
   public MoveRegionsOfTableAction(String tableName) {
     this(-1, tableName);
@@ -43,6 +45,7 @@ public class MoveRegionsOfTableAction ex
     this.sleepTime = sleepTime;
     this.tableNameBytes = Bytes.toBytes(tableName);
     this.tableName = tableName;
+    this.maxTime = 10 * 60 * 1000; // 10 min default
   }
 
   @Override
@@ -62,6 +65,9 @@ public class MoveRegionsOfTableAction ex
       return;
     }
 
+    Collections.shuffle(regions);
+
+    long start = System.currentTimeMillis();
     for (HRegionInfo regionInfo:regions) {
       try {
         String destServerName =
@@ -74,6 +80,12 @@ public class MoveRegionsOfTableAction ex
       if (sleepTime > 0) {
         Thread.sleep(sleepTime);
       }
+
+      // put a limit on max num regions. Otherwise, this won't finish
+      // with a sleep time of 10sec, 100 regions will finish in 16min
+      if (System.currentTimeMillis() - start > maxTime) {
+        break;
+      }
     }
   }
 }

Modified: hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java?rev=1536910&r1=1536909&r2=1536910&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java (original)
+++ hbase/branches/0.96/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java Tue Oct 29 22:05:58 2013
@@ -145,6 +145,7 @@ public class PolicyBasedChaosMonkey exte
       return;
     }
     for (Thread monkeyThread : monkeyThreads) {
+      // TODO: bound the wait time per policy
       monkeyThread.join();
     }
   }

Modified: hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java?rev=1536910&r1=1536909&r2=1536910&view=diff
==============================================================================
--- hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java (original)
+++ hbase/branches/0.96/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java Tue Oct 29 22:05:58 2013
@@ -2019,7 +2019,7 @@ public class HBaseTestingUtility extends
       HConstants.DEFAULT_HBASE_CLIENT_PAUSE);
     int numRetries = getConfiguration().getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
       HConstants.DEFAULT_HBASE_CLIENT_RETRIES_NUMBER);
-    RetryCounter retrier = new RetryCounter(numRetries, (int)pause, TimeUnit.MICROSECONDS);
+    RetryCounter retrier = new RetryCounter(numRetries+1, (int)pause, TimeUnit.MICROSECONDS);
     while(retrier.shouldRetry()) {
       int index = getMiniHBaseCluster().getServerWith(firstrow);
       if (index != -1) {