You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@accumulo.apache.org by ib...@apache.org on 2018/02/01 23:26:57 UTC
[accumulo] branch 1.7 updated: ACCUMULO-4777 Refine retry logging
This is an automated email from the ASF dual-hosted git repository.
ibella pushed a commit to branch 1.7
in repository https://gitbox.apache.org/repos/asf/accumulo.git
The following commit(s) were added to refs/heads/1.7 by this push:
new b52db47 ACCUMULO-4777 Refine retry logging
b52db47 is described below
commit b52db479e9c4e371a94b1f019e818622562e3dc3
Author: Ivan Bella <iv...@bella.name>
AuthorDate: Fri Jan 26 09:31:11 2018 -0500
ACCUMULO-4777 Refine retry logging
* Reverted to only log the warning message after attempt 1.
* Fixed log messages to display 1 based attempt number.
* Avoid logging warning on every retry.
---
.../org/apache/accumulo/fate/zookeeper/Retry.java | 60 ++++++++++++++++++++--
.../accumulo/fate/zookeeper/RetryFactory.java | 17 ++++--
.../accumulo/fate/zookeeper/RetryFactoryTest.java | 10 ++--
.../apache/accumulo/fate/zookeeper/RetryTest.java | 32 ++++++++++--
.../org/apache/accumulo/tserver/TabletServer.java | 5 +-
.../accumulo/tserver/log/TabletServerLogger.java | 5 +-
6 files changed, 108 insertions(+), 21 deletions(-)
diff --git a/fate/src/main/java/org/apache/accumulo/fate/zookeeper/Retry.java b/fate/src/main/java/org/apache/accumulo/fate/zookeeper/Retry.java
index 1f55d72..d710e2c 100644
--- a/fate/src/main/java/org/apache/accumulo/fate/zookeeper/Retry.java
+++ b/fate/src/main/java/org/apache/accumulo/fate/zookeeper/Retry.java
@@ -19,6 +19,8 @@ package org.apache.accumulo.fate.zookeeper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.google.common.annotations.VisibleForTesting;
+
/**
* Encapsulates the retrying implementation for some operation. Provides bounded retry attempts with a bounded, linear backoff.
*/
@@ -30,6 +32,9 @@ public class Retry {
private long maxRetries, maxWait, waitIncrement;
private long retriesDone, currentWait;
+ private long logInterval;
+ private long lastRetryLog;
+
/**
* @param maxRetries
* Maximum times to retry or MAX_RETRY_DISABLED if no maximum
@@ -39,13 +44,18 @@ public class Retry {
* The maximum wait (ms)
* @param waitIncrement
* The amount of time (ms) to increment next wait time by
+ * @param logInterval
+ * The amount of time (ms) between logging retries
*/
- public Retry(long maxRetries, long startWait, long waitIncrement, long maxWait) {
+
+ public Retry(long maxRetries, long startWait, long waitIncrement, long maxWait, long logInterval) {
this.maxRetries = maxRetries;
this.maxWait = maxWait;
this.waitIncrement = waitIncrement;
this.retriesDone = 0l;
this.currentWait = startWait;
+ this.logInterval = logInterval;
+ this.lastRetryLog = -1;
}
/**
@@ -55,47 +65,57 @@ public class Retry {
* The maximum wait (ms)
* @param waitIncrement
* The amount of time (ms) to increment next wait time by
+ * @param logInterval
+ * The amount of time (ms) between logging retries
*/
- public Retry(long startWait, long waitIncrement, long maxWait) {
- this(MAX_RETRY_DISABLED, startWait, waitIncrement, maxWait);
+ public Retry(long startWait, long waitIncrement, long maxWait, long logInterval) {
+ this(MAX_RETRY_DISABLED, startWait, waitIncrement, maxWait, logInterval);
}
// Visible for testing
+ @VisibleForTesting
long getMaxRetries() {
return maxRetries;
}
// Visible for testing
+ @VisibleForTesting
long getCurrentWait() {
return currentWait;
}
// Visible for testing
+ @VisibleForTesting
long getWaitIncrement() {
return waitIncrement;
}
// Visible for testing
+ @VisibleForTesting
long getMaxWait() {
return maxWait;
}
// Visible for testing
+ @VisibleForTesting
void setMaxRetries(long maxRetries) {
this.maxRetries = maxRetries;
}
// Visible for testing
+ @VisibleForTesting
void setStartWait(long startWait) {
this.currentWait = startWait;
}
// Visible for testing
+ @VisibleForTesting
void setWaitIncrement(long waitIncrement) {
this.waitIncrement = waitIncrement;
}
// Visible for testing
+ @VisibleForTesting
void setMaxWait(long maxWait) {
this.maxWait = maxWait;
}
@@ -104,6 +124,15 @@ public class Retry {
return maxRetries < 0;
}
+ // Visible for testing
+ void setLogInterval(long logInterval) {
+ this.logInterval = logInterval;
+ }
+
+ public long getLogInterval() {
+ return logInterval;
+ }
+
public boolean canRetry() {
return isMaxRetryDisabled() || (retriesDone < maxRetries);
}
@@ -125,7 +154,9 @@ public class Retry {
}
public void waitForNextAttempt() throws InterruptedException {
- log.debug("Sleeping for " + currentWait + "ms before retrying operation");
+ if (log.isDebugEnabled()) {
+ log.debug("Sleeping for " + currentWait + "ms before retrying operation");
+ }
sleep(currentWait);
currentWait = Math.min(maxWait, currentWait + waitIncrement);
}
@@ -133,4 +164,25 @@ public class Retry {
protected void sleep(long wait) throws InterruptedException {
Thread.sleep(wait);
}
+
+ public void logRetry(Logger log, String message, Throwable t) {
+ // log the first time, and then after every logInterval
+ if (lastRetryLog < 0 || (System.currentTimeMillis() - lastRetryLog) > logInterval) {
+ log.warn(getMessage(message), t);
+ lastRetryLog = System.currentTimeMillis();
+ }
+ }
+
+ public void logRetry(Logger log, String message) {
+ // log the first time, and then after every logInterval
+ if (lastRetryLog < 0 || (System.currentTimeMillis() - lastRetryLog) > logInterval) {
+ log.warn(getMessage(message));
+ lastRetryLog = System.currentTimeMillis();
+ }
+ }
+
+ private String getMessage(String message) {
+ return message + ", retrying attempt " + (retriesDone + 1) + " (suppressing retry messages for " + logInterval + "ms)";
+ }
+
}
diff --git a/fate/src/main/java/org/apache/accumulo/fate/zookeeper/RetryFactory.java b/fate/src/main/java/org/apache/accumulo/fate/zookeeper/RetryFactory.java
index aa6da20..e56eddc 100644
--- a/fate/src/main/java/org/apache/accumulo/fate/zookeeper/RetryFactory.java
+++ b/fate/src/main/java/org/apache/accumulo/fate/zookeeper/RetryFactory.java
@@ -21,9 +21,12 @@ package org.apache.accumulo.fate.zookeeper;
*/
public class RetryFactory {
public static final long DEFAULT_MAX_RETRIES = 10l, DEFAULT_START_WAIT = 250l, DEFAULT_WAIT_INCREMENT = 250l, DEFAULT_MAX_WAIT = 5000l;
- public static final RetryFactory DEFAULT_INSTANCE = new RetryFactory(DEFAULT_MAX_RETRIES, DEFAULT_START_WAIT, DEFAULT_WAIT_INCREMENT, DEFAULT_MAX_WAIT);
+ // default logging interval: 3 minutes
+ public static final long DEFAULT_LOG_INTERVAL = 3 * 60 * 1000;
+ public static final RetryFactory DEFAULT_INSTANCE = new RetryFactory(DEFAULT_MAX_RETRIES, DEFAULT_START_WAIT, DEFAULT_WAIT_INCREMENT, DEFAULT_MAX_WAIT,
+ DEFAULT_LOG_INTERVAL);
- private final long maxRetries, startWait, maxWait, waitIncrement;
+ private final long maxRetries, startWait, maxWait, waitIncrement, logInterval;
/**
* Create a retry factor for retries with a limit
@@ -36,12 +39,15 @@ public class RetryFactory {
* The amount of ms to increment the wait on subsequent retries
* @param maxWait
* The max amount of wait time between retries
+ * @param logInterval
+ * The amount of time (ms) between logging retries
*/
- public RetryFactory(long maxRetries, long startWait, long waitIncrement, long maxWait) {
+ public RetryFactory(long maxRetries, long startWait, long waitIncrement, long maxWait, long logInterval) {
this.maxRetries = maxRetries;
this.startWait = startWait;
this.maxWait = maxWait;
this.waitIncrement = waitIncrement;
+ this.logInterval = logInterval;
}
/**
@@ -54,14 +60,15 @@ public class RetryFactory {
* @param maxWait
* The max amount of wait time between retries
*/
- public RetryFactory(long startWait, long waitIncrement, long maxWait) {
+ public RetryFactory(long startWait, long waitIncrement, long maxWait, long logInterval) {
this.maxRetries = Retry.MAX_RETRY_DISABLED;
this.startWait = startWait;
this.maxWait = maxWait;
this.waitIncrement = waitIncrement;
+ this.logInterval = logInterval;
}
public Retry create() {
- return new Retry(maxRetries, startWait, waitIncrement, maxWait);
+ return new Retry(maxRetries, startWait, waitIncrement, maxWait, logInterval);
}
}
diff --git a/fate/src/test/java/org/apache/accumulo/fate/zookeeper/RetryFactoryTest.java b/fate/src/test/java/org/apache/accumulo/fate/zookeeper/RetryFactoryTest.java
index 9ba19a4..cc5d4bc 100644
--- a/fate/src/test/java/org/apache/accumulo/fate/zookeeper/RetryFactoryTest.java
+++ b/fate/src/test/java/org/apache/accumulo/fate/zookeeper/RetryFactoryTest.java
@@ -26,26 +26,28 @@ public class RetryFactoryTest {
@Test
public void properArgumentsInRetry() {
- long maxRetries = 10, startWait = 50l, maxWait = 5000l, waitIncrement = 500l;
- RetryFactory factory = new RetryFactory(maxRetries, startWait, waitIncrement, maxWait);
+ long maxRetries = 10, startWait = 50l, maxWait = 5000l, waitIncrement = 500l, logInterval = 10000l;
+ RetryFactory factory = new RetryFactory(maxRetries, startWait, waitIncrement, maxWait, logInterval);
Retry retry = factory.create();
Assert.assertEquals(maxRetries, retry.getMaxRetries());
Assert.assertEquals(startWait, retry.getCurrentWait());
Assert.assertEquals(maxWait, retry.getMaxWait());
Assert.assertEquals(waitIncrement, retry.getWaitIncrement());
+ Assert.assertEquals(logInterval, retry.getLogInterval());
}
@Test
public void properArgumentsInUnlimitedRetry() {
- long startWait = 50l, maxWait = 5000l, waitIncrement = 500l;
- RetryFactory factory = new RetryFactory(startWait, waitIncrement, maxWait);
+ long startWait = 50l, maxWait = 5000l, waitIncrement = 500l, logInterval = 10000l;
+ RetryFactory factory = new RetryFactory(startWait, waitIncrement, maxWait, logInterval);
Retry retry = factory.create();
Assert.assertEquals(Retry.MAX_RETRY_DISABLED, retry.getMaxRetries());
Assert.assertEquals(startWait, retry.getCurrentWait());
Assert.assertEquals(maxWait, retry.getMaxWait());
Assert.assertEquals(waitIncrement, retry.getWaitIncrement());
+ Assert.assertEquals(logInterval, retry.getLogInterval());
}
}
diff --git a/fate/src/test/java/org/apache/accumulo/fate/zookeeper/RetryTest.java b/fate/src/test/java/org/apache/accumulo/fate/zookeeper/RetryTest.java
index 6bbd1ff..fddef47 100644
--- a/fate/src/test/java/org/apache/accumulo/fate/zookeeper/RetryTest.java
+++ b/fate/src/test/java/org/apache/accumulo/fate/zookeeper/RetryTest.java
@@ -20,6 +20,7 @@ import org.easymock.EasyMock;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
+import org.slf4j.Logger;
/**
*
@@ -27,15 +28,15 @@ import org.junit.Test;
public class RetryTest {
private Retry retry;
- long initialWait = 1000l, waitIncrement = 1000l, maxRetries = 5;
+ long initialWait = 1000l, waitIncrement = 1000l, maxRetries = 5, logInterval = 1000l;
private Retry unlimitedRetry1;
private Retry unlimitedRetry2;
@Before
public void setup() {
- retry = new Retry(maxRetries, initialWait, waitIncrement, maxRetries * 1000l);
- unlimitedRetry1 = new Retry(initialWait, waitIncrement, maxRetries * 1000l);
- unlimitedRetry2 = new Retry(-10, initialWait, waitIncrement, maxRetries * 1000l);
+ retry = new Retry(maxRetries, initialWait, waitIncrement, maxRetries * 1000l, logInterval);
+ unlimitedRetry1 = new Retry(initialWait, waitIncrement, maxRetries * 1000l, logInterval);
+ unlimitedRetry2 = new Retry(-10, initialWait, waitIncrement, maxRetries * 1000l, logInterval);
}
@Test
@@ -147,4 +148,27 @@ public class RetryTest {
unlimitedRetry2.useRetry();
}
}
+
+ @Test
+ public void testLogging() {
+ Logger testLogger = EasyMock.createMock(Logger.class);
+ testLogger.warn(EasyMock.anyObject(String.class));
+ EasyMock.expectLastCall().times(4, 6);
+ EasyMock.replay(testLogger);
+
+ // we want to do this for 5 second and observe the log messages
+ long start = System.currentTimeMillis();
+ long end = System.currentTimeMillis();
+ int i = 0;
+ for (; (end - start < 5000l) && (i < Integer.MAX_VALUE); i++) {
+ unlimitedRetry1.logRetry(testLogger, "failure message");
+ unlimitedRetry1.useRetry();
+ end = System.currentTimeMillis();
+ }
+
+ // now observe what log messages we got which should be around 5 +- 1
+ EasyMock.verify(testLogger);
+ Assert.assertTrue(i > 10);
+
+ }
}
diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
index f20e145..7809e2e 100644
--- a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
+++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
@@ -355,9 +355,10 @@ public class TabletServer extends AccumuloServerContext implements Runnable {
// Tolerate `toleratedWalCreationFailures` failures, waiting `walFailureRetryIncrement` milliseconds after the first failure,
// incrementing the next wait period by the same value, for a maximum of `walFailureRetryMax` retries.
final RetryFactory walCreationRetryFactory = new RetryFactory(toleratedWalCreationFailures, walFailureRetryIncrement, walFailureRetryIncrement,
- walFailureRetryMax);
+ walFailureRetryMax, RetryFactory.DEFAULT_LOG_INTERVAL);
// Tolerate infinite failures for the write, however backing off the same as for creation failures.
- final RetryFactory walWritingRetryFactory = new RetryFactory(walFailureRetryIncrement, walFailureRetryIncrement, walFailureRetryMax);
+ final RetryFactory walWritingRetryFactory = new RetryFactory(walFailureRetryIncrement, walFailureRetryIncrement, walFailureRetryMax,
+ RetryFactory.DEFAULT_LOG_INTERVAL);
logger = new TabletServerLogger(this, walogMaxSize, syncCounter, flushCounter, walCreationRetryFactory, walWritingRetryFactory, walogMaxAge);
this.resourceManager = new TabletServerResourceManager(this, fs);
diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java
index 1487199..3843210 100644
--- a/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java
+++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java
@@ -338,9 +338,10 @@ public class TabletServerLogger {
success = (currentLogSet == logSetId.get());
}
} catch (DfsLogger.LogClosedException ex) {
- log.debug("Logs closed while writing, retrying attempt " + writeRetry.retriesCompleted());
+ writeRetry.logRetry(log, "Logs closed while writing");
} catch (Exception t) {
- log.warn("Failed to write to WAL, retrying attempt " + writeRetry.retriesCompleted(), t);
+ if (writeRetry.retriesCompleted() != 0)
+ writeRetry.logRetry(log, "Failed to write to WAL", t);
try {
// Backoff
--
To stop receiving notification emails like this one, please contact
ibella@apache.org.