You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ratis.apache.org by sz...@apache.org on 2021/12/10 09:50:54 UTC
[ratis] branch master updated: RATIS-1408. Add metrics of statemachine read/write timeout count. (#506)
This is an automated email from the ASF dual-hosted git repository.
szetszwo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ratis.git
The following commit(s) were added to refs/heads/master by this push:
new a7555cf RATIS-1408. Add metrics of statemachine read/write timeout count. (#506)
a7555cf is described below
commit a7555cfc143093aba76b4b148e890406b25241f5
Author: Sammi Chen <sa...@apache.org>
AuthorDate: Fri Dec 10 17:49:42 2021 +0800
RATIS-1408. Add metrics of statemachine read/write timeout count. (#506)
---
.../java/org/apache/ratis/server/RaftServerConfigKeys.java | 10 ++++++++++
.../org/apache/ratis/server/metrics/RaftLogMetrics.java | 8 ++++++++
.../org/apache/ratis/server/leader/LogAppenderBase.java | 3 +--
.../ratis/server/metrics/SegmentedRaftLogMetrics.java | 13 +++++++++++++
.../java/org/apache/ratis/server/raftlog/RaftLogBase.java | 7 +++++--
.../server/raftlog/segmented/SegmentedRaftLogWorker.java | 7 +++++--
6 files changed, 42 insertions(+), 6 deletions(-)
diff --git a/ratis-server-api/src/main/java/org/apache/ratis/server/RaftServerConfigKeys.java b/ratis-server-api/src/main/java/org/apache/ratis/server/RaftServerConfigKeys.java
index f27fda2..3bf56a7 100644
--- a/ratis-server-api/src/main/java/org/apache/ratis/server/RaftServerConfigKeys.java
+++ b/ratis-server-api/src/main/java/org/apache/ratis/server/RaftServerConfigKeys.java
@@ -360,6 +360,16 @@ public interface RaftServerConfigKeys {
static void setSyncTimeoutRetry(RaftProperties properties, int syncTimeoutRetry) {
setInt(properties::setInt, SYNC_TIMEOUT_RETRY_KEY, syncTimeoutRetry, requireMin(-1));
}
+
+ String READ_TIMEOUT_KEY = PREFIX + ".read.timeout";
+ TimeDuration READ_TIMEOUT_DEFAULT = TimeDuration.valueOf(1000, TimeUnit.MILLISECONDS);
+ static TimeDuration readTimeout(RaftProperties properties) {
+ return getTimeDuration(properties.getTimeDuration(READ_TIMEOUT_DEFAULT.getUnit()),
+ READ_TIMEOUT_KEY, READ_TIMEOUT_DEFAULT, getDefaultLog());
+ }
+ static void setReadTimeout(RaftProperties properties, TimeDuration readTimeout) {
+ setTimeDuration(properties::setTimeDuration, READ_TIMEOUT_KEY, readTimeout);
+ }
}
interface Appender {
diff --git a/ratis-server-api/src/main/java/org/apache/ratis/server/metrics/RaftLogMetrics.java b/ratis-server-api/src/main/java/org/apache/ratis/server/metrics/RaftLogMetrics.java
index 42c1605..4899e03 100644
--- a/ratis-server-api/src/main/java/org/apache/ratis/server/metrics/RaftLogMetrics.java
+++ b/ratis-server-api/src/main/java/org/apache/ratis/server/metrics/RaftLogMetrics.java
@@ -23,4 +23,12 @@ import org.apache.ratis.server.raftlog.LogEntryHeader;
public interface RaftLogMetrics {
/** A log entry just has been committed. */
void onLogEntryCommitted(LogEntryHeader header);
+
+ /** Read statemachine data timeout */
+ default void onStateMachineDataReadTimeout() {
+ }
+
+ /** Write statemachine data timeout */
+ default void onStateMachineDataWriteTimeout() {
+ }
}
diff --git a/ratis-server/src/main/java/org/apache/ratis/server/leader/LogAppenderBase.java b/ratis-server/src/main/java/org/apache/ratis/server/leader/LogAppenderBase.java
index 28ce1eb..c2ca24b 100644
--- a/ratis-server/src/main/java/org/apache/ratis/server/leader/LogAppenderBase.java
+++ b/ratis-server/src/main/java/org/apache/ratis/server/leader/LogAppenderBase.java
@@ -148,8 +148,7 @@ public abstract class LogAppenderBase implements LogAppender {
}
final List<LogEntryProto> protos = buffer.pollList(getHeartbeatWaitTimeMs(), EntryWithData::getEntry,
- (entry, time, exception) -> LOG.warn("{}: Failed to get {} in {}: {}",
- follower.getName(), entry, time, exception));
+ (entry, time, exception) -> LOG.warn("Failed to get {} in {}: {}", entry, time, exception));
buffer.clear();
assertProtos(protos, followerNext, previous, snapshotIndex);
return leaderState.newAppendEntriesRequestProto(follower, protos, previous, callId);
diff --git a/ratis-server/src/main/java/org/apache/ratis/server/metrics/SegmentedRaftLogMetrics.java b/ratis-server/src/main/java/org/apache/ratis/server/metrics/SegmentedRaftLogMetrics.java
index e513d94..f359e1a 100644
--- a/ratis-server/src/main/java/org/apache/ratis/server/metrics/SegmentedRaftLogMetrics.java
+++ b/ratis-server/src/main/java/org/apache/ratis/server/metrics/SegmentedRaftLogMetrics.java
@@ -67,6 +67,10 @@ public class SegmentedRaftLogMetrics extends RaftLogMetricsBase {
/** Number of entries appended to the raft log */
public static final String RAFT_LOG_APPEND_ENTRY_COUNT = "appendEntryCount";
public static final String RAFT_LOG_PURGE_METRIC = "purgeLog";
+ /** Time taken for a Raft log operation to complete write state machine data. */
+ public static final String RAFT_LOG_STATEMACHINE_DATA_WRITE_TIMEOUT_COUNT = "numStateMachineDataWriteTimeout";
+ /** Time taken for a Raft log operation to complete read state machine data. */
+ public static final String RAFT_LOG_STATEMACHINE_DATA_READ_TIMEOUT_COUNT = "numStateMachineDataReadTimeout";
//////////////////////////////
// Raft Log Read Path Metrics
@@ -165,4 +169,13 @@ public class SegmentedRaftLogMetrics extends RaftLogMetricsBase {
public Timer getRaftLogPurgeTimer() {
return getTimer(RAFT_LOG_PURGE_METRIC);
}
+
+ public void onStateMachineDataWriteTimeout() {
+ registry.counter(RAFT_LOG_STATEMACHINE_DATA_WRITE_TIMEOUT_COUNT).inc();
+ }
+
+ @Override
+ public void onStateMachineDataReadTimeout() {
+ registry.counter(RAFT_LOG_STATEMACHINE_DATA_READ_TIMEOUT_COUNT).inc();
+ }
}
diff --git a/ratis-server/src/main/java/org/apache/ratis/server/raftlog/RaftLogBase.java b/ratis-server/src/main/java/org/apache/ratis/server/raftlog/RaftLogBase.java
index 3e06963..f075757 100644
--- a/ratis-server/src/main/java/org/apache/ratis/server/raftlog/RaftLogBase.java
+++ b/ratis-server/src/main/java/org/apache/ratis/server/raftlog/RaftLogBase.java
@@ -75,6 +75,7 @@ public abstract class RaftLogBase implements RaftLog {
private final Runner runner = new Runner(this::getName);
private final OpenCloseState state;
private final LongSupplier getSnapshotIndexFromStateMachine;
+ private final TimeDuration stateMachineDataReadTimeout;
private volatile LogEntryProto lastMetadataEntry = null;
@@ -91,6 +92,7 @@ public abstract class RaftLogBase implements RaftLog {
this.maxBufferSize = RaftServerConfigKeys.Log.Appender.bufferByteLimit(properties).getSizeInt();
this.state = new OpenCloseState(getName());
this.getSnapshotIndexFromStateMachine = getSnapshotIndexFromStateMachine;
+ this.stateMachineDataReadTimeout = RaftServerConfigKeys.Log.StateMachineData.readTimeout(properties);
}
@Override
@@ -392,8 +394,9 @@ public abstract class RaftLogBase implements RaftLog {
entryProto = future.thenApply(data -> LogProtoUtils.addStateMachineData(data, logEntry))
.get(timeout.getDuration(), timeout.getUnit());
} catch (TimeoutException t) {
- final String err = getName() + ": Timeout readStateMachineData for " + toLogEntryString(logEntry);
- LOG.error(err, t);
+ if (timeout.compareTo(stateMachineDataReadTimeout) > 0) {
+ getRaftLogMetrics().onStateMachineDataReadTimeout();
+ }
throw t;
} catch (Exception e) {
final String err = getName() + ": Failed readStateMachineData for " + toLogEntryString(logEntry);
diff --git a/ratis-server/src/main/java/org/apache/ratis/server/raftlog/segmented/SegmentedRaftLogWorker.java b/ratis-server/src/main/java/org/apache/ratis/server/raftlog/segmented/SegmentedRaftLogWorker.java
index d184ebc..fc8f61e 100644
--- a/ratis-server/src/main/java/org/apache/ratis/server/raftlog/segmented/SegmentedRaftLogWorker.java
+++ b/ratis-server/src/main/java/org/apache/ratis/server/raftlog/segmented/SegmentedRaftLogWorker.java
@@ -68,11 +68,13 @@ class SegmentedRaftLogWorker {
private final boolean sync;
private final TimeDuration syncTimeout;
private final int syncTimeoutRetry;
+ private final SegmentedRaftLogMetrics metrics;
- StateMachineDataPolicy(RaftProperties properties) {
+ StateMachineDataPolicy(RaftProperties properties, SegmentedRaftLogMetrics metricRegistry) {
this.sync = RaftServerConfigKeys.Log.StateMachineData.sync(properties);
this.syncTimeout = RaftServerConfigKeys.Log.StateMachineData.syncTimeout(properties);
this.syncTimeoutRetry = RaftServerConfigKeys.Log.StateMachineData.syncTimeoutRetry(properties);
+ this.metrics = metricRegistry;
Preconditions.assertTrue(syncTimeoutRetry >= -1);
}
@@ -90,6 +92,7 @@ class SegmentedRaftLogWorker {
} catch(TimeoutIOException e) {
LOG.warn("Timeout " + retry + (syncTimeoutRetry == -1? "/~": "/" + syncTimeoutRetry), e);
lastException = e;
+ metrics.onStateMachineDataWriteTimeout();
}
}
Objects.requireNonNull(lastException, "lastException == null");
@@ -196,7 +199,7 @@ class SegmentedRaftLogWorker {
this.forceSyncNum = RaftServerConfigKeys.Log.forceSyncNum(properties);
this.flushBatchSize = 0;
- this.stateMachineDataPolicy = new StateMachineDataPolicy(properties);
+ this.stateMachineDataPolicy = new StateMachineDataPolicy(properties, metricRegistry);
this.workerThread = new Thread(this::run, name);