You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ratis.apache.org by sz...@apache.org on 2021/12/10 09:50:54 UTC

[ratis] branch master updated: RATIS-1408. Add metrics of statemachine read/write timeout count. (#506)

This is an automated email from the ASF dual-hosted git repository.

szetszwo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ratis.git


The following commit(s) were added to refs/heads/master by this push:
     new a7555cf  RATIS-1408. Add metrics of statemachine read/write timeout count. (#506)
a7555cf is described below

commit a7555cfc143093aba76b4b148e890406b25241f5
Author: Sammi Chen <sa...@apache.org>
AuthorDate: Fri Dec 10 17:49:42 2021 +0800

    RATIS-1408. Add metrics of statemachine read/write timeout count. (#506)
---
 .../java/org/apache/ratis/server/RaftServerConfigKeys.java  | 10 ++++++++++
 .../org/apache/ratis/server/metrics/RaftLogMetrics.java     |  8 ++++++++
 .../org/apache/ratis/server/leader/LogAppenderBase.java     |  3 +--
 .../ratis/server/metrics/SegmentedRaftLogMetrics.java       | 13 +++++++++++++
 .../java/org/apache/ratis/server/raftlog/RaftLogBase.java   |  7 +++++--
 .../server/raftlog/segmented/SegmentedRaftLogWorker.java    |  7 +++++--
 6 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/ratis-server-api/src/main/java/org/apache/ratis/server/RaftServerConfigKeys.java b/ratis-server-api/src/main/java/org/apache/ratis/server/RaftServerConfigKeys.java
index f27fda2..3bf56a7 100644
--- a/ratis-server-api/src/main/java/org/apache/ratis/server/RaftServerConfigKeys.java
+++ b/ratis-server-api/src/main/java/org/apache/ratis/server/RaftServerConfigKeys.java
@@ -360,6 +360,16 @@ public interface RaftServerConfigKeys {
       static void setSyncTimeoutRetry(RaftProperties properties, int syncTimeoutRetry) {
         setInt(properties::setInt, SYNC_TIMEOUT_RETRY_KEY, syncTimeoutRetry, requireMin(-1));
       }
+
+      String READ_TIMEOUT_KEY = PREFIX + ".read.timeout";
+      TimeDuration READ_TIMEOUT_DEFAULT = TimeDuration.valueOf(1000, TimeUnit.MILLISECONDS);
+      static TimeDuration readTimeout(RaftProperties properties) {
+        return getTimeDuration(properties.getTimeDuration(READ_TIMEOUT_DEFAULT.getUnit()),
+            READ_TIMEOUT_KEY, READ_TIMEOUT_DEFAULT, getDefaultLog());
+      }
+      static void setReadTimeout(RaftProperties properties, TimeDuration readTimeout) {
+        setTimeDuration(properties::setTimeDuration, READ_TIMEOUT_KEY, readTimeout);
+      }
     }
 
     interface Appender {
diff --git a/ratis-server-api/src/main/java/org/apache/ratis/server/metrics/RaftLogMetrics.java b/ratis-server-api/src/main/java/org/apache/ratis/server/metrics/RaftLogMetrics.java
index 42c1605..4899e03 100644
--- a/ratis-server-api/src/main/java/org/apache/ratis/server/metrics/RaftLogMetrics.java
+++ b/ratis-server-api/src/main/java/org/apache/ratis/server/metrics/RaftLogMetrics.java
@@ -23,4 +23,12 @@ import org.apache.ratis.server.raftlog.LogEntryHeader;
 public interface RaftLogMetrics {
   /** A log entry just has been committed. */
   void onLogEntryCommitted(LogEntryHeader header);
+
+  /** Read statemachine data timeout */
+  default void onStateMachineDataReadTimeout() {
+  }
+
+  /** Write statemachine data timeout */
+  default void onStateMachineDataWriteTimeout() {
+  }
 }
diff --git a/ratis-server/src/main/java/org/apache/ratis/server/leader/LogAppenderBase.java b/ratis-server/src/main/java/org/apache/ratis/server/leader/LogAppenderBase.java
index 28ce1eb..c2ca24b 100644
--- a/ratis-server/src/main/java/org/apache/ratis/server/leader/LogAppenderBase.java
+++ b/ratis-server/src/main/java/org/apache/ratis/server/leader/LogAppenderBase.java
@@ -148,8 +148,7 @@ public abstract class LogAppenderBase implements LogAppender {
     }
 
     final List<LogEntryProto> protos = buffer.pollList(getHeartbeatWaitTimeMs(), EntryWithData::getEntry,
-        (entry, time, exception) -> LOG.warn("{}: Failed to get {} in {}: {}",
-            follower.getName(), entry, time, exception));
+        (entry, time, exception) -> LOG.warn("Failed to get {} in {}: {}", entry, time, exception));
     buffer.clear();
     assertProtos(protos, followerNext, previous, snapshotIndex);
     return leaderState.newAppendEntriesRequestProto(follower, protos, previous, callId);
diff --git a/ratis-server/src/main/java/org/apache/ratis/server/metrics/SegmentedRaftLogMetrics.java b/ratis-server/src/main/java/org/apache/ratis/server/metrics/SegmentedRaftLogMetrics.java
index e513d94..f359e1a 100644
--- a/ratis-server/src/main/java/org/apache/ratis/server/metrics/SegmentedRaftLogMetrics.java
+++ b/ratis-server/src/main/java/org/apache/ratis/server/metrics/SegmentedRaftLogMetrics.java
@@ -67,6 +67,10 @@ public class SegmentedRaftLogMetrics extends RaftLogMetricsBase {
   /** Number of entries appended to the raft log */
   public static final String RAFT_LOG_APPEND_ENTRY_COUNT = "appendEntryCount";
   public static final String RAFT_LOG_PURGE_METRIC = "purgeLog";
+  /** Time taken for a Raft log operation to complete write state machine data. */
+  public static final String RAFT_LOG_STATEMACHINE_DATA_WRITE_TIMEOUT_COUNT = "numStateMachineDataWriteTimeout";
+  /** Time taken for a Raft log operation to complete read state machine data. */
+  public static final String RAFT_LOG_STATEMACHINE_DATA_READ_TIMEOUT_COUNT = "numStateMachineDataReadTimeout";
 
   //////////////////////////////
   // Raft Log Read Path Metrics
@@ -165,4 +169,13 @@ public class SegmentedRaftLogMetrics extends RaftLogMetricsBase {
   public Timer getRaftLogPurgeTimer() {
     return getTimer(RAFT_LOG_PURGE_METRIC);
   }
+
+  public void onStateMachineDataWriteTimeout() {
+    registry.counter(RAFT_LOG_STATEMACHINE_DATA_WRITE_TIMEOUT_COUNT).inc();
+  }
+
+  @Override
+  public void onStateMachineDataReadTimeout() {
+    registry.counter(RAFT_LOG_STATEMACHINE_DATA_READ_TIMEOUT_COUNT).inc();
+  }
 }
diff --git a/ratis-server/src/main/java/org/apache/ratis/server/raftlog/RaftLogBase.java b/ratis-server/src/main/java/org/apache/ratis/server/raftlog/RaftLogBase.java
index 3e06963..f075757 100644
--- a/ratis-server/src/main/java/org/apache/ratis/server/raftlog/RaftLogBase.java
+++ b/ratis-server/src/main/java/org/apache/ratis/server/raftlog/RaftLogBase.java
@@ -75,6 +75,7 @@ public abstract class RaftLogBase implements RaftLog {
   private final Runner runner = new Runner(this::getName);
   private final OpenCloseState state;
   private final LongSupplier getSnapshotIndexFromStateMachine;
+  private final TimeDuration stateMachineDataReadTimeout;
 
   private volatile LogEntryProto lastMetadataEntry = null;
 
@@ -91,6 +92,7 @@ public abstract class RaftLogBase implements RaftLog {
     this.maxBufferSize = RaftServerConfigKeys.Log.Appender.bufferByteLimit(properties).getSizeInt();
     this.state = new OpenCloseState(getName());
     this.getSnapshotIndexFromStateMachine = getSnapshotIndexFromStateMachine;
+    this.stateMachineDataReadTimeout = RaftServerConfigKeys.Log.StateMachineData.readTimeout(properties);
   }
 
   @Override
@@ -392,8 +394,9 @@ public abstract class RaftLogBase implements RaftLog {
         entryProto = future.thenApply(data -> LogProtoUtils.addStateMachineData(data, logEntry))
             .get(timeout.getDuration(), timeout.getUnit());
       } catch (TimeoutException t) {
-        final String err = getName() + ": Timeout readStateMachineData for " + toLogEntryString(logEntry);
-        LOG.error(err, t);
+        if (timeout.compareTo(stateMachineDataReadTimeout) > 0) {
+          getRaftLogMetrics().onStateMachineDataReadTimeout();
+        }
         throw t;
       } catch (Exception e) {
         final String err = getName() + ": Failed readStateMachineData for " + toLogEntryString(logEntry);
diff --git a/ratis-server/src/main/java/org/apache/ratis/server/raftlog/segmented/SegmentedRaftLogWorker.java b/ratis-server/src/main/java/org/apache/ratis/server/raftlog/segmented/SegmentedRaftLogWorker.java
index d184ebc..fc8f61e 100644
--- a/ratis-server/src/main/java/org/apache/ratis/server/raftlog/segmented/SegmentedRaftLogWorker.java
+++ b/ratis-server/src/main/java/org/apache/ratis/server/raftlog/segmented/SegmentedRaftLogWorker.java
@@ -68,11 +68,13 @@ class SegmentedRaftLogWorker {
     private final boolean sync;
     private final TimeDuration syncTimeout;
     private final int syncTimeoutRetry;
+    private final SegmentedRaftLogMetrics metrics;
 
-    StateMachineDataPolicy(RaftProperties properties) {
+    StateMachineDataPolicy(RaftProperties properties, SegmentedRaftLogMetrics metricRegistry) {
       this.sync = RaftServerConfigKeys.Log.StateMachineData.sync(properties);
       this.syncTimeout = RaftServerConfigKeys.Log.StateMachineData.syncTimeout(properties);
       this.syncTimeoutRetry = RaftServerConfigKeys.Log.StateMachineData.syncTimeoutRetry(properties);
+      this.metrics = metricRegistry;
       Preconditions.assertTrue(syncTimeoutRetry >= -1);
     }
 
@@ -90,6 +92,7 @@ class SegmentedRaftLogWorker {
         } catch(TimeoutIOException e) {
           LOG.warn("Timeout " + retry + (syncTimeoutRetry == -1? "/~": "/" + syncTimeoutRetry), e);
           lastException = e;
+          metrics.onStateMachineDataWriteTimeout();
         }
       }
       Objects.requireNonNull(lastException, "lastException == null");
@@ -196,7 +199,7 @@ class SegmentedRaftLogWorker {
     this.forceSyncNum = RaftServerConfigKeys.Log.forceSyncNum(properties);
     this.flushBatchSize = 0;
 
-    this.stateMachineDataPolicy = new StateMachineDataPolicy(properties);
+    this.stateMachineDataPolicy = new StateMachineDataPolicy(properties, metricRegistry);
 
     this.workerThread = new Thread(this::run, name);