You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ratis.apache.org by wi...@apache.org on 2023/01/13 11:43:41 UTC

[ratis] branch master updated: RATIS-1766. Add descriptions to metrics entries (#804)

This is an automated email from the ASF dual-hosted git repository.

williamsong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ratis.git


The following commit(s) were added to refs/heads/master by this push:
     new c2181e5fa RATIS-1766. Add descriptions to metrics entries (#804)
c2181e5fa is described below

commit c2181e5fab51254452db094693d5bddbda78d5ba
Author: William Song <48...@users.noreply.github.com>
AuthorDate: Fri Jan 13 19:43:35 2023 +0800

    RATIS-1766. Add descriptions to metrics entries (#804)
---
 ratis-docs/src/site/markdown/metrics.md            | 145 +++++++++++++++++++++
 .../server/metrics/SegmentedRaftLogMetrics.java    |   4 +-
 2 files changed, 147 insertions(+), 2 deletions(-)

diff --git a/ratis-docs/src/site/markdown/metrics.md b/ratis-docs/src/site/markdown/metrics.md
new file mode 100644
index 000000000..10c78ccbb
--- /dev/null
+++ b/ratis-docs/src/site/markdown/metrics.md
@@ -0,0 +1,145 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+
+# Metrics
+
+## Ratis Server
+
+### StateMachine Metrics
+
+| Application | Component     | Name                | Type  | Description                                                  |
+|-------------|---------------|---------------------|-------|--------------------------------------------------------------|
+| ratis       | state_machine | appliedIndex        | Gauge | Applied index of state machine                               |
+| ratis       | state_machine | applyCompletedIndex | Gauge | Last log index completely applied to the state machine       |
+| ratis       | state_machine | takeSnapshot        | Timer | Time taken for state machine to take a snapshot              |
+
+
+### Leader Election Metrics
+
+| Application | Component       | Name                          | Type    | Description                                           |
+|-------------|-----------------|-------------------------------|---------|-------------------------------------------------------|
+| ratis       | leader_election | electionCount                 | Counter | Number of leader elections of this group              |
+| ratis       | leader_election | timeoutCount                  | Counter | Number of election timeouts of this peer              |
+| ratis       | leader_election | electionTime                  | Timer   | Time spent on leader election                         |
+| ratis       | leader_election | lastLeaderElapsedTime         | Gauge   | Time elapsed since last hearing from an active leader |
+| ratis       | leader_election | transferLeadershipCount       | Counter | Number of transferLeader requests                     |
+| ratis       | leader_election | lastLeaderElectionElapsedTime | Gauge   | Time elapsed since last leader election               |
+
+### Log Appender Metrics
+
+| Application | Component    | Name                              | Type  | Description                                 |
+|-------------|--------------|-----------------------------------|-------|---------------------------------------------|
+| ratis       | log_appender | follower_{peer}_next_index        | Gauge | Next index of peer                          |
+| ratis       | log_appender | follower_{peer}_match_index       | Gauge | Match index of peer                         |
+| ratis       | log_appender | follower_{peer}_rpc_response_time | Gauge | Time elapsed since peer's last rpc response |
+
+### Raft Log Metrics
+
+| Application | Component  | Name                            | Type    | Description                                                                                                   |
+|-------------|------------|---------------------------------|---------|---------------------------------------------------------------------------------------------------------------|
+| ratis       | log_worker | metadataLogEntryCount           | Counter | Number of metadata(term-index) log entries                                                                    |
+| ratis       | log_worker | configLogEntryCount             | Counter | Number of configuration log entries                                                                           |
+| ratis       | log_worker | stateMachineLogEntryCount       | Counter | Number of statemachine log entries                                                                            |
+| ratis       | log_worker | flushTime                       | Timer   | Time taken to flush log                                                                                       |
+| ratis       | log_worker | flushCount                      | Counter | Number of times log flush is invoked                                                                          |
+| ratis       | log_worker | syncTime                        | Timer   | Time taken to sync log (fsync)                                                                                |
+| ratis       | log_worker | dataQueueSize                   | Gauge   | Raft log data queue size which at any time gives the number of log related operations in the queue            |
+| ratis       | log_worker | workerQueueSize                 | Gauge   | Raft log worker queue size which at any time gives number of committed entries that are to be synced          |
+| ratis       | log_worker | syncBatchSize                   | Gauge   | Number of raft log entries synced in each flush call                                                          |
+| ratis       | log_worker | cacheMissCount                  | Counter | Count of RaftLogCache Misses                                                                                  |
+| ratis       | log_worker | cacheHitCount                   | Counter | Count of RaftLogCache Hits                                                                                    |
+| ratis       | log_worker | closedSegmentsNum               | Gauge   | Number of closed raft log segments                                                                            |
+| ratis       | log_worker | closedSegmentsSizeInBytes       | Gauge   | Size of closed raft log segments in bytes                                                                     |
+| ratis       | log_worker | openSegmentSizeInBytes          | Gauge   | Size of open raft log segment in bytes                                                                        |
+| ratis       | log_worker | appendEntryLatency              | Timer   | Total time taken to append a raft log entry                                                                   |
+| ratis       | log_worker | enqueuedTime                    | Timer   | Time spent by a Raft log operation in the queue                                                               |
+| ratis       | log_worker | queueingDelay                   | Timer   | Time taken for a requested Raft log operation to get into the queue, waiting for the queue to be non-full     |
+| ratis       | log_worker | {operation}ExecutionTime        | Timer   | Time taken for a Raft log operation(open/close/flush/write/purge) to complete execution                       |
+| ratis       | log_worker | appendEntryCount                | Counter | Number of entries appended to the raft log                                                                    |
+| ratis       | log_worker | purgeLog                        | Timer   | Time taken for Raft log purge operation to complete execution                                                 |
+| ratis       | log_worker | numStateMachineDataWriteTimeout | Counter | Number of statemachine dataApi write timeouts                                                                 |
+| ratis       | log_worker | numStateMachineDataReadTimeout  | Counter | Number of statemachine dataApi read timeouts                                                                  |
+| ratis       | log_worker | readEntryLatency                | Timer   | Time required to read a raft log entry from actual raft log file and create a raft log entry                  |
+| ratis       | log_worker | segmentLoadLatency              | Timer   | Time required to load and process raft log segments during restart                                            |
+
+
+### Raft Server Metrics
+
+| Application | Component | Name                             | Type    | Description                                                         |
+|-------------|-----------|----------------------------------|---------|---------------------------------------------------------------------|
+| ratis       | server    | {peer}_lastHeartbeatElapsedTime  | Gauge   | Time elapsed since last heartbeat rpc response                      |
+| ratis       | server    | follower_append_entry_latency    | Timer   | Time taken for followers to append log entries                      |
+| ratis       | server    | {peer}_peerCommitIndex           | Gauge   | Commit index of peer                                                |
+| ratis       | server    | clientReadRequest                | Timer   | Time taken to process read requests from client                     |
+| ratis       | server    | clientStaleReadRequest           | Timer   | Time taken to process stale-read requests from client               |
+| ratis       | server    | clientWriteRequest               | Timer   | Time taken to process write requests from client                    |
+| ratis       | server    | clientWatch{level}Request        | Timer   | Time taken to process watch(replication_level) requests from client |
+| ratis       | server    | numRequestQueueLimitHits         | Counter | Number of (total client requests in queue) limit hits               |
+| ratis       | server    | numRequestsByteSizeLimitHits     | Counter | Number of (total size of client requests in queue) limit hits       |
+| ratis       | server    | numResourceLimitHits             | Counter | Sum of numRequestQueueLimitHits and numRequestsByteSizeLimitHits    |
+| ratis       | server    | numPendingRequestInQueue         | Gauge   | Number of pending client requests in queue                          |
+| ratis       | server    | numPendingRequestMegaByteSize    | Gauge   | Total size of pending client requests in queue                      |
+| ratis       | server    | retryCacheEntryCount             | Gauge   | Number of entries in retry cache                                    |
+| ratis       | server    | retryCacheHitCount               | Gauge   | Number of retry cache hits                                          |
+| ratis       | server    | retryCacheHitRate                | Gauge   | Retry cache hit rate                                                |
+| ratis       | server    | retryCacheMissCount              | Gauge   | Number of retry cache misses                                        |
+| ratis       | server    | retryCacheMissRate               | Gauge   | Retry cache miss rate                                               |
+| ratis       | server    | numFailedClientStaleReadOnServer | Counter | Number of failed stale-read requests                                |
+| ratis       | server    | numFailedClientReadOnServer      | Counter | Number of failed read requests                                      |
+| ratis       | server    | numFailedClientWriteOnServer     | Counter | Number of failed write requests                                     |
+| ratis       | server    | numFailedClientWatchOnServer     | Counter | Number of failed watch requests                                     |
+| ratis       | server    | numFailedClientStreamOnServer    | Counter | Number of failed stream requests                                    |
+| ratis       | server    | numInstallSnapshot               | Counter | Number of install-snapshot requests                                 |
+
+
+## Ratis Netty Metrics
+
+| Application | Component     | Name                          | Type    | Description                               |
+|-------------|---------------|-------------------------------|---------|-------------------------------------------|
+| ratis_netty | stream_server | {request}_latency             | Timer   | Time taken to process data stream request |
+| ratis_netty | stream_server | {request}_success_reply_count | Counter | Number of success replies of request      |
+| ratis_netty | stream_server | {request}_fail_reply_count    | Counter | Number of fail replies of request         |
+| ratis_netty | stream_server | num_requests_{request}        | Counter | Number of total data stream requests      |
+
+## Ratis gRPC Metrics
+
+### Message Metrics
+
+| Application | Component              | Name                       | Type    | Description                                      |
+|-------------|------------------------|----------------------------|---------|--------------------------------------------------|
+| ratis       | client_message_metrics | {method}_started_total     | Counter | Total messages started of {method}               |
+| ratis       | client_message_metrics | {method}_completed_total   | Counter | Total messages completed of {method}             |
+| ratis       | client_message_metrics | {method}_received_executed | Counter | Total messages received and executed of {method} |
+| ratis       | server_message_metrics | {method}_started_total     | Counter | Total messages started of {method}               |
+| ratis       | server_message_metrics | {method}_completed_total   | Counter | Total messages completed of {method}             |
+| ratis       | server_message_metrics | {method}_received_executed | Counter | Total messages received and executed of {method} |
+
+### gRPC Log Appender Metrics
+
+
+| Application | Component    | Name                                  | Type    | Description                                 |
+|-------------|--------------|---------------------------------------|---------|---------------------------------------------|
+| ratis_grpc  | log_appender | {appendEntries}_latency               | Timer   | Latency of method (appendEntries/heartbeat) |
+| ratis_grpc  | log_appender | {follower}_success_reply_count        | Counter | Number of success replies                   |
+| ratis_grpc  | log_appender | {follower}_not_leader_reply_count     | Counter | Number of NotLeader replies                 |
+| ratis_grpc  | log_appender | {follower}_inconsistency_reply_count  | Counter | Number of Inconsistency replies             |
+| ratis_grpc  | log_appender | {follower}_append_entry_timeout_count | Counter | Number of appendEntries timeouts            |
+| ratis_grpc  | log_appender | {follower}_pending_log_requests_count | Counter | Number of pending requests                  |
+| ratis_grpc  | log_appender | num_retries                           | Counter | Number of request retries                   |
+| ratis_grpc  | log_appender | num_requests                          | Counter | Number of requests in total                 |
+| ratis_grpc  | log_appender | num_install_snapshot                  | Counter | Number of install snapshot requests         |
diff --git a/ratis-server/src/main/java/org/apache/ratis/server/metrics/SegmentedRaftLogMetrics.java b/ratis-server/src/main/java/org/apache/ratis/server/metrics/SegmentedRaftLogMetrics.java
index a6a1af0ac..865bdcde9 100644
--- a/ratis-server/src/main/java/org/apache/ratis/server/metrics/SegmentedRaftLogMetrics.java
+++ b/ratis-server/src/main/java/org/apache/ratis/server/metrics/SegmentedRaftLogMetrics.java
@@ -68,9 +68,9 @@ public class SegmentedRaftLogMetrics extends RaftLogMetricsBase {
   /** Number of entries appended to the raft log */
   public static final String RAFT_LOG_APPEND_ENTRY_COUNT = "appendEntryCount";
   public static final String RAFT_LOG_PURGE_METRIC = "purgeLog";
-  /** Time taken for a Raft log operation to complete write state machine data. */
+  /** Number of statemachine dataApi write timeouts */
   public static final String RAFT_LOG_STATEMACHINE_DATA_WRITE_TIMEOUT_COUNT = "numStateMachineDataWriteTimeout";
-  /** Time taken for a Raft log operation to complete read state machine data. */
+  /** Number of statemachine dataApi read timeouts */
   public static final String RAFT_LOG_STATEMACHINE_DATA_READ_TIMEOUT_COUNT = "numStateMachineDataReadTimeout";
 
   //////////////////////////////