You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@accumulo.apache.org by ib...@apache.org on 2018/02/01 23:26:57 UTC

[accumulo] branch 1.7 updated: ACCUMULO-4777 Refine retry logging

This is an automated email from the ASF dual-hosted git repository.

ibella pushed a commit to branch 1.7
in repository https://gitbox.apache.org/repos/asf/accumulo.git


The following commit(s) were added to refs/heads/1.7 by this push:
     new b52db47  ACCUMULO-4777 Refine retry logging
b52db47 is described below

commit b52db479e9c4e371a94b1f019e818622562e3dc3
Author: Ivan Bella <iv...@bella.name>
AuthorDate: Fri Jan 26 09:31:11 2018 -0500

    ACCUMULO-4777 Refine retry logging
    
    * Reverted to only log the warning message after attempt 1.
    * Fixed log messages to display 1 based attempt number.
    * Avoid logging warning on every retry.
---
 .../org/apache/accumulo/fate/zookeeper/Retry.java  | 60 ++++++++++++++++++++--
 .../accumulo/fate/zookeeper/RetryFactory.java      | 17 ++++--
 .../accumulo/fate/zookeeper/RetryFactoryTest.java  | 10 ++--
 .../apache/accumulo/fate/zookeeper/RetryTest.java  | 32 ++++++++++--
 .../org/apache/accumulo/tserver/TabletServer.java  |  5 +-
 .../accumulo/tserver/log/TabletServerLogger.java   |  5 +-
 6 files changed, 108 insertions(+), 21 deletions(-)

diff --git a/fate/src/main/java/org/apache/accumulo/fate/zookeeper/Retry.java b/fate/src/main/java/org/apache/accumulo/fate/zookeeper/Retry.java
index 1f55d72..d710e2c 100644
--- a/fate/src/main/java/org/apache/accumulo/fate/zookeeper/Retry.java
+++ b/fate/src/main/java/org/apache/accumulo/fate/zookeeper/Retry.java
@@ -19,6 +19,8 @@ package org.apache.accumulo.fate.zookeeper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.annotations.VisibleForTesting;
+
 /**
  * Encapsulates the retrying implementation for some operation. Provides bounded retry attempts with a bounded, linear backoff.
  */
@@ -30,6 +32,9 @@ public class Retry {
   private long maxRetries, maxWait, waitIncrement;
   private long retriesDone, currentWait;
 
+  private long logInterval;
+  private long lastRetryLog;
+
   /**
    * @param maxRetries
    *          Maximum times to retry or MAX_RETRY_DISABLED if no maximum
@@ -39,13 +44,18 @@ public class Retry {
    *          The maximum wait (ms)
    * @param waitIncrement
    *          The amount of time (ms) to increment next wait time by
+   * @param logInterval
+   *          The amount of time (ms) between logging retries
    */
-  public Retry(long maxRetries, long startWait, long waitIncrement, long maxWait) {
+
+  public Retry(long maxRetries, long startWait, long waitIncrement, long maxWait, long logInterval) {
     this.maxRetries = maxRetries;
     this.maxWait = maxWait;
     this.waitIncrement = waitIncrement;
     this.retriesDone = 0l;
     this.currentWait = startWait;
+    this.logInterval = logInterval;
+    this.lastRetryLog = -1;
   }
 
   /**
@@ -55,47 +65,57 @@ public class Retry {
    *          The maximum wait (ms)
    * @param waitIncrement
    *          The amount of time (ms) to increment next wait time by
+   * @param logInterval
+   *          The amount of time (ms) between logging retries
    */
-  public Retry(long startWait, long waitIncrement, long maxWait) {
-    this(MAX_RETRY_DISABLED, startWait, waitIncrement, maxWait);
+  public Retry(long startWait, long waitIncrement, long maxWait, long logInterval) {
+    this(MAX_RETRY_DISABLED, startWait, waitIncrement, maxWait, logInterval);
   }
 
   // Visible for testing
+  @VisibleForTesting
   long getMaxRetries() {
     return maxRetries;
   }
 
   // Visible for testing
+  @VisibleForTesting
   long getCurrentWait() {
     return currentWait;
   }
 
   // Visible for testing
+  @VisibleForTesting
   long getWaitIncrement() {
     return waitIncrement;
   }
 
   // Visible for testing
+  @VisibleForTesting
   long getMaxWait() {
     return maxWait;
   }
 
   // Visible for testing
+  @VisibleForTesting
   void setMaxRetries(long maxRetries) {
     this.maxRetries = maxRetries;
   }
 
   // Visible for testing
+  @VisibleForTesting
   void setStartWait(long startWait) {
     this.currentWait = startWait;
   }
 
   // Visible for testing
+  @VisibleForTesting
   void setWaitIncrement(long waitIncrement) {
     this.waitIncrement = waitIncrement;
   }
 
   // Visible for testing
+  @VisibleForTesting
   void setMaxWait(long maxWait) {
     this.maxWait = maxWait;
   }
@@ -104,6 +124,15 @@ public class Retry {
     return maxRetries < 0;
   }
 
+  // Visible for testing
+  void setLogInterval(long logInterval) {
+    this.logInterval = logInterval;
+  }
+
+  public long getLogInterval() {
+    return logInterval;
+  }
+
   public boolean canRetry() {
     return isMaxRetryDisabled() || (retriesDone < maxRetries);
   }
@@ -125,7 +154,9 @@ public class Retry {
   }
 
   public void waitForNextAttempt() throws InterruptedException {
-    log.debug("Sleeping for " + currentWait + "ms before retrying operation");
+    if (log.isDebugEnabled()) {
+      log.debug("Sleeping for " + currentWait + "ms before retrying operation");
+    }
     sleep(currentWait);
     currentWait = Math.min(maxWait, currentWait + waitIncrement);
   }
@@ -133,4 +164,25 @@ public class Retry {
   protected void sleep(long wait) throws InterruptedException {
     Thread.sleep(wait);
   }
+
+  public void logRetry(Logger log, String message, Throwable t) {
+    // log the first time, and then after every logInterval
+    if (lastRetryLog < 0 || (System.currentTimeMillis() - lastRetryLog) > logInterval) {
+      log.warn(getMessage(message), t);
+      lastRetryLog = System.currentTimeMillis();
+    }
+  }
+
+  public void logRetry(Logger log, String message) {
+    // log the first time, and then after every logInterval
+    if (lastRetryLog < 0 || (System.currentTimeMillis() - lastRetryLog) > logInterval) {
+      log.warn(getMessage(message));
+      lastRetryLog = System.currentTimeMillis();
+    }
+  }
+
+  private String getMessage(String message) {
+    return message + ", retrying attempt " + (retriesDone + 1) + " (suppressing retry messages for " + logInterval + "ms)";
+  }
+
 }
diff --git a/fate/src/main/java/org/apache/accumulo/fate/zookeeper/RetryFactory.java b/fate/src/main/java/org/apache/accumulo/fate/zookeeper/RetryFactory.java
index aa6da20..e56eddc 100644
--- a/fate/src/main/java/org/apache/accumulo/fate/zookeeper/RetryFactory.java
+++ b/fate/src/main/java/org/apache/accumulo/fate/zookeeper/RetryFactory.java
@@ -21,9 +21,12 @@ package org.apache.accumulo.fate.zookeeper;
  */
 public class RetryFactory {
   public static final long DEFAULT_MAX_RETRIES = 10l, DEFAULT_START_WAIT = 250l, DEFAULT_WAIT_INCREMENT = 250l, DEFAULT_MAX_WAIT = 5000l;
-  public static final RetryFactory DEFAULT_INSTANCE = new RetryFactory(DEFAULT_MAX_RETRIES, DEFAULT_START_WAIT, DEFAULT_WAIT_INCREMENT, DEFAULT_MAX_WAIT);
+  // default logging interval: 3 minutes
+  public static final long DEFAULT_LOG_INTERVAL = 3 * 60 * 1000;
+  public static final RetryFactory DEFAULT_INSTANCE = new RetryFactory(DEFAULT_MAX_RETRIES, DEFAULT_START_WAIT, DEFAULT_WAIT_INCREMENT, DEFAULT_MAX_WAIT,
+      DEFAULT_LOG_INTERVAL);
 
-  private final long maxRetries, startWait, maxWait, waitIncrement;
+  private final long maxRetries, startWait, maxWait, waitIncrement, logInterval;
 
   /**
    * Create a retry factor for retries with a limit
@@ -36,12 +39,15 @@ public class RetryFactory {
    *          The amount of ms to increment the wait on subsequent retries
    * @param maxWait
    *          The max amount of wait time between retries
+   * @param logInterval
+   *          The amount of time (ms) between logging retries
    */
-  public RetryFactory(long maxRetries, long startWait, long waitIncrement, long maxWait) {
+  public RetryFactory(long maxRetries, long startWait, long waitIncrement, long maxWait, long logInterval) {
     this.maxRetries = maxRetries;
     this.startWait = startWait;
     this.maxWait = maxWait;
     this.waitIncrement = waitIncrement;
+    this.logInterval = logInterval;
   }
 
   /**
@@ -54,14 +60,15 @@ public class RetryFactory {
    * @param maxWait
    *          The max amount of wait time between retries
    */
-  public RetryFactory(long startWait, long waitIncrement, long maxWait) {
+  public RetryFactory(long startWait, long waitIncrement, long maxWait, long logInterval) {
     this.maxRetries = Retry.MAX_RETRY_DISABLED;
     this.startWait = startWait;
     this.maxWait = maxWait;
     this.waitIncrement = waitIncrement;
+    this.logInterval = logInterval;
   }
 
   public Retry create() {
-    return new Retry(maxRetries, startWait, waitIncrement, maxWait);
+    return new Retry(maxRetries, startWait, waitIncrement, maxWait, logInterval);
   }
 }
diff --git a/fate/src/test/java/org/apache/accumulo/fate/zookeeper/RetryFactoryTest.java b/fate/src/test/java/org/apache/accumulo/fate/zookeeper/RetryFactoryTest.java
index 9ba19a4..cc5d4bc 100644
--- a/fate/src/test/java/org/apache/accumulo/fate/zookeeper/RetryFactoryTest.java
+++ b/fate/src/test/java/org/apache/accumulo/fate/zookeeper/RetryFactoryTest.java
@@ -26,26 +26,28 @@ public class RetryFactoryTest {
 
   @Test
   public void properArgumentsInRetry() {
-    long maxRetries = 10, startWait = 50l, maxWait = 5000l, waitIncrement = 500l;
-    RetryFactory factory = new RetryFactory(maxRetries, startWait, waitIncrement, maxWait);
+    long maxRetries = 10, startWait = 50l, maxWait = 5000l, waitIncrement = 500l, logInterval = 10000l;
+    RetryFactory factory = new RetryFactory(maxRetries, startWait, waitIncrement, maxWait, logInterval);
     Retry retry = factory.create();
 
     Assert.assertEquals(maxRetries, retry.getMaxRetries());
     Assert.assertEquals(startWait, retry.getCurrentWait());
     Assert.assertEquals(maxWait, retry.getMaxWait());
     Assert.assertEquals(waitIncrement, retry.getWaitIncrement());
+    Assert.assertEquals(logInterval, retry.getLogInterval());
   }
 
   @Test
   public void properArgumentsInUnlimitedRetry() {
-    long startWait = 50l, maxWait = 5000l, waitIncrement = 500l;
-    RetryFactory factory = new RetryFactory(startWait, waitIncrement, maxWait);
+    long startWait = 50l, maxWait = 5000l, waitIncrement = 500l, logInterval = 10000l;
+    RetryFactory factory = new RetryFactory(startWait, waitIncrement, maxWait, logInterval);
     Retry retry = factory.create();
 
     Assert.assertEquals(Retry.MAX_RETRY_DISABLED, retry.getMaxRetries());
     Assert.assertEquals(startWait, retry.getCurrentWait());
     Assert.assertEquals(maxWait, retry.getMaxWait());
     Assert.assertEquals(waitIncrement, retry.getWaitIncrement());
+    Assert.assertEquals(logInterval, retry.getLogInterval());
   }
 
 }
diff --git a/fate/src/test/java/org/apache/accumulo/fate/zookeeper/RetryTest.java b/fate/src/test/java/org/apache/accumulo/fate/zookeeper/RetryTest.java
index 6bbd1ff..fddef47 100644
--- a/fate/src/test/java/org/apache/accumulo/fate/zookeeper/RetryTest.java
+++ b/fate/src/test/java/org/apache/accumulo/fate/zookeeper/RetryTest.java
@@ -20,6 +20,7 @@ import org.easymock.EasyMock;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
+import org.slf4j.Logger;
 
 /**
  *
@@ -27,15 +28,15 @@ import org.junit.Test;
 public class RetryTest {
 
   private Retry retry;
-  long initialWait = 1000l, waitIncrement = 1000l, maxRetries = 5;
+  long initialWait = 1000l, waitIncrement = 1000l, maxRetries = 5, logInterval = 1000l;
   private Retry unlimitedRetry1;
   private Retry unlimitedRetry2;
 
   @Before
   public void setup() {
-    retry = new Retry(maxRetries, initialWait, waitIncrement, maxRetries * 1000l);
-    unlimitedRetry1 = new Retry(initialWait, waitIncrement, maxRetries * 1000l);
-    unlimitedRetry2 = new Retry(-10, initialWait, waitIncrement, maxRetries * 1000l);
+    retry = new Retry(maxRetries, initialWait, waitIncrement, maxRetries * 1000l, logInterval);
+    unlimitedRetry1 = new Retry(initialWait, waitIncrement, maxRetries * 1000l, logInterval);
+    unlimitedRetry2 = new Retry(-10, initialWait, waitIncrement, maxRetries * 1000l, logInterval);
   }
 
   @Test
@@ -147,4 +148,27 @@ public class RetryTest {
       unlimitedRetry2.useRetry();
     }
   }
+
+  @Test
+  public void testLogging() {
+    Logger testLogger = EasyMock.createMock(Logger.class);
+    testLogger.warn(EasyMock.anyObject(String.class));
+    EasyMock.expectLastCall().times(4, 6);
+    EasyMock.replay(testLogger);
+
+    // we want to do this for 5 second and observe the log messages
+    long start = System.currentTimeMillis();
+    long end = System.currentTimeMillis();
+    int i = 0;
+    for (; (end - start < 5000l) && (i < Integer.MAX_VALUE); i++) {
+      unlimitedRetry1.logRetry(testLogger, "failure message");
+      unlimitedRetry1.useRetry();
+      end = System.currentTimeMillis();
+    }
+
+    // now observe what log messages we got which should be around 5 +- 1
+    EasyMock.verify(testLogger);
+    Assert.assertTrue(i > 10);
+
+  }
 }
diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
index f20e145..7809e2e 100644
--- a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
+++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
@@ -355,9 +355,10 @@ public class TabletServer extends AccumuloServerContext implements Runnable {
     // Tolerate `toleratedWalCreationFailures` failures, waiting `walFailureRetryIncrement` milliseconds after the first failure,
     // incrementing the next wait period by the same value, for a maximum of `walFailureRetryMax` retries.
     final RetryFactory walCreationRetryFactory = new RetryFactory(toleratedWalCreationFailures, walFailureRetryIncrement, walFailureRetryIncrement,
-        walFailureRetryMax);
+        walFailureRetryMax, RetryFactory.DEFAULT_LOG_INTERVAL);
     // Tolerate infinite failures for the write, however backing off the same as for creation failures.
-    final RetryFactory walWritingRetryFactory = new RetryFactory(walFailureRetryIncrement, walFailureRetryIncrement, walFailureRetryMax);
+    final RetryFactory walWritingRetryFactory = new RetryFactory(walFailureRetryIncrement, walFailureRetryIncrement, walFailureRetryMax,
+        RetryFactory.DEFAULT_LOG_INTERVAL);
 
     logger = new TabletServerLogger(this, walogMaxSize, syncCounter, flushCounter, walCreationRetryFactory, walWritingRetryFactory, walogMaxAge);
     this.resourceManager = new TabletServerResourceManager(this, fs);
diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java
index 1487199..3843210 100644
--- a/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java
+++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java
@@ -338,9 +338,10 @@ public class TabletServerLogger {
           success = (currentLogSet == logSetId.get());
         }
       } catch (DfsLogger.LogClosedException ex) {
-        log.debug("Logs closed while writing, retrying attempt " + writeRetry.retriesCompleted());
+        writeRetry.logRetry(log, "Logs closed while writing");
       } catch (Exception t) {
-        log.warn("Failed to write to WAL, retrying attempt " + writeRetry.retriesCompleted(), t);
+        if (writeRetry.retriesCompleted() != 0)
+          writeRetry.logRetry(log, "Failed to write to WAL", t);
 
         try {
           // Backoff

-- 
To stop receiving notification emails like this one, please contact
ibella@apache.org.