You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2020/06/24 13:29:14 UTC

[incubator-doris] branch master updated: [Bug] Enable to get TCP metrics for linux kernel 2.x (#3921)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 46c64f0  [Bug] Enable to get TCP metrics for linux kernel 2.x (#3921)
46c64f0 is described below

commit 46c64f08612881901f6200c2688cfc5821b3290e
Author: Mingyu Chen <mo...@gmail.com>
AuthorDate: Wed Jun 24 21:29:07 2020 +0800

    [Bug] Enable to get TCP metrics for linux kernel 2.x (#3921)
    
    Fix #3920
    
    CL:
    1. Parse the TCP metrics header in `/proc/net/snmp` to get the right position of the metrics.
    2. Add 2 new metrics: `tcp_in_segs` and `tcp_out_segs`
---
 be/src/util/system_metrics.cpp                     | 37 +++++++++++++++++-----
 .../operation/monitor-metrics/be-metrics.md        | 16 ++++++++++
 .../operation/monitor-metrics/fe-metrics.md        | 16 ++++++++++
 .../operation/monitor-metrics/be-metrics.md        | 16 ++++++++++
 .../operation/monitor-metrics/fe-metrics.md        | 16 ++++++++++
 .../java/org/apache/doris/metric/MetricRepo.java   | 22 +++++++++++++
 .../org/apache/doris/metric/SystemMetrics.java     | 30 ++++++++++++++----
 7 files changed, 139 insertions(+), 14 deletions(-)

diff --git a/be/src/util/system_metrics.cpp b/be/src/util/system_metrics.cpp
index a572661..79a5907 100644
--- a/be/src/util/system_metrics.cpp
+++ b/be/src/util/system_metrics.cpp
@@ -16,6 +16,8 @@
 // under the License.
 
 #include "util/system_metrics.h"
+#include "gutil/strings/split.h" // for string split
+#include "gutil/strtoint.h" //  for atoi64
 
 #include <stdio.h>
 #include <gperftools/malloc_extension.h>
@@ -72,6 +74,10 @@ struct SnmpMetrics {
     METRIC_DEFINE_INT_LOCK_COUNTER(tcp_in_errs, MetricUnit::NOUNIT);
     // All TCP packets retransmitted
     METRIC_DEFINE_INT_LOCK_COUNTER(tcp_retrans_segs, MetricUnit::NOUNIT);
+    // All received TCP packets
+    METRIC_DEFINE_INT_LOCK_COUNTER(tcp_in_segs, MetricUnit::NOUNIT);
+    // All sent TCP packets (the `OutSegs` field; packets with the RST mark are counted separately by `OutRsts`)
+    METRIC_DEFINE_INT_LOCK_COUNTER(tcp_out_segs, MetricUnit::NOUNIT);
 };
 
 struct FileDescriptorMetrics {
@@ -323,6 +329,8 @@ void SystemMetrics::_install_snmp_metrics(MetricRegistry* registry) {
                                   &_snmp_metrics->name)
     REGISTER_SNMP_METRIC(tcp_in_errs);
     REGISTER_SNMP_METRIC(tcp_retrans_segs);
+    REGISTER_SNMP_METRIC(tcp_in_segs);
+    REGISTER_SNMP_METRIC(tcp_out_segs);
 }
 
 void SystemMetrics::_update_net_metrics() {
@@ -449,8 +457,16 @@ void SystemMetrics::_update_snmp_metrics() {
         return;
     }
 
-    // skip the Tcp header line
+    // parse the Tcp header
     // Tcp: RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors
+    std::vector<std::string> headers = strings::Split(_line_ptr, " ");
+    std::unordered_map<std::string, int32_t> header_map;
+    int32_t pos = 0;
+    for (auto& h : headers) {
+        header_map.emplace(h, pos++);
+    }
+
+    // read the metrics of TCP
     if (getline(&_line_ptr, &_line_buf_size, fp) < 0) {
         char buf[64];
         LOG(WARNING) << "failed to skip Tcp header line of /proc/net/snmp, errno=" << errno
@@ -461,15 +477,20 @@ void SystemMetrics::_update_snmp_metrics() {
 
     // metric line looks like:
     // Tcp: 1 200 120000 -1 47849374 38601877 3353843 2320314 276 1033354613 1166025166 825439 12694 23238924 0
-    int64_t retrans_segs = 0;
-    int64_t in_errs = 0;
-    sscanf(_line_ptr,
-            "Tcp: %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d"
-            " %" PRId64 " %" PRId64 " %*d %*d",
-            &retrans_segs, &in_errs);
-
+    std::vector<std::string> metrics = strings::Split(_line_ptr, " ");
+    if (metrics.size() != headers.size()) {
+        LOG(WARNING) << "invalid tcp metrics line: " << _line_ptr;
+        fclose(fp);
+        return;
+    }
+    int64_t retrans_segs = atoi64(metrics[header_map["RetransSegs"]]);
+    int64_t in_errs = atoi64(metrics[header_map["InErrs"]]);
+    int64_t in_segs = atoi64(metrics[header_map["InSegs"]]);
+    int64_t out_segs = atoi64(metrics[header_map["OutSegs"]]);
     _snmp_metrics->tcp_retrans_segs.set_value(retrans_segs);
     _snmp_metrics->tcp_in_errs.set_value(in_errs);
+    _snmp_metrics->tcp_in_segs.set_value(in_segs);
+    _snmp_metrics->tcp_out_segs.set_value(out_segs);
 
     if (ferror(fp) != 0) {
         char buf[64];
diff --git a/docs/en/administrator-guide/operation/monitor-metrics/be-metrics.md b/docs/en/administrator-guide/operation/monitor-metrics/be-metrics.md
index 642228f..a2716a7 100644
--- a/docs/en/administrator-guide/operation/monitor-metrics/be-metrics.md
+++ b/docs/en/administrator-guide/operation/monitor-metrics/be-metrics.md
@@ -59,3 +59,19 @@ Value of the `Tcp: RetransSegs` field in `/proc/net/snmp`. Represents the number
 The incidence rate can be calculated in combination with the sampling period.
 
 Usually used to troubleshoot network problems.
+
+### `doris_be_snmp{name="tcp_in_segs"}`
+
+Value of the `Tcp: InSegs` field in `/proc/net/snmp`. Represents the number of received TCP packets.
+
+`(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)` can be used to calculate the error rate of received TCP packets.
+
+Usually used to troubleshoot network problems.
+
+### `doris_be_snmp{name="tcp_out_segs"}`
+
+Value of the `Tcp: OutSegs` field in `/proc/net/snmp`. Represents the number of sent TCP packets.
+
+`(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)` can be used to calculate the retransmission rate of TCP packets.
+
+Usually used to troubleshoot network problems.
diff --git a/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md b/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md
index 9c5487f..26ba0ac 100644
--- a/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md
+++ b/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md
@@ -59,3 +59,19 @@ Value of the `Tcp: RetransSegs` field in `/proc/net/snmp`. Represents the number
 The incidence rate can be calculated in combination with the sampling period.
 
 Usually used to troubleshoot network problems.
+
+### `doris_fe_snmp{name="tcp_in_segs"}`
+
+Value of the `Tcp: InSegs` field in `/proc/net/snmp`. Represents the number of received TCP packets.
+
+`(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)` can be used to calculate the error rate of received TCP packets.
+
+Usually used to troubleshoot network problems.
+
+### `doris_fe_snmp{name="tcp_out_segs"}`
+
+Value of the `Tcp: OutSegs` field in `/proc/net/snmp`. Represents the number of sent TCP packets.
+
+`(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)` can be used to calculate the retransmission rate of TCP packets.
+
+Usually used to troubleshoot network problems.
diff --git a/docs/zh-CN/administrator-guide/operation/monitor-metrics/be-metrics.md b/docs/zh-CN/administrator-guide/operation/monitor-metrics/be-metrics.md
index 1a2afde..41533b8 100644
--- a/docs/zh-CN/administrator-guide/operation/monitor-metrics/be-metrics.md
+++ b/docs/zh-CN/administrator-guide/operation/monitor-metrics/be-metrics.md
@@ -59,3 +59,19 @@ BE 的监控项可以通过以下方式访问:
 结合采样周期可以计算发生率。
 
 通常用于排查网络问题。
+
+### `doris_be_snmp{name="tcp_in_segs"}`
+
+该监控项为 `/proc/net/snmp` 中的 `Tcp: InSegs` 字段值。表示当前接收到的所有 TCP 包的数量。
+
+通过 `(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)` 可以计算接收到的 TCP 错误包率。
+
+通常用于排查网络问题。
+
+### `doris_be_snmp{name="tcp_out_segs"}`
+
+该监控项为 `/proc/net/snmp` 中的 `Tcp: OutSegs` 字段值。表示当前发送的所有 TCP 包的数量。
+
+通过 `(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)` 可以计算 TCP 重传率。
+
+通常用于排查网络问题。
diff --git a/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md b/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md
index a938ae6..aaa1854 100644
--- a/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md
+++ b/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md
@@ -59,3 +59,19 @@ FE 的监控项可以通过以下方式访问:
 结合采样周期可以计算发生率。
 
 通常用于排查网络问题。
+
+### `doris_fe_snmp{name="tcp_in_segs"}`
+
+该监控项为 `/proc/net/snmp` 中的 `Tcp: InSegs` 字段值。表示当前接收到的所有 TCP 包的数量。
+
+通过 `(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)` 可以计算接收到的 TCP 错误包率。
+
+通常用于排查网络问题。
+
+### `doris_fe_snmp{name="tcp_out_segs"}`
+
+该监控项为 `/proc/net/snmp` 中的 `Tcp: OutSegs` 字段值。表示当前发送的所有 TCP 包的数量。
+
+通过 `(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)` 可以计算 TCP 重传率。
+
+通常用于排查网络问题。
diff --git a/fe/src/main/java/org/apache/doris/metric/MetricRepo.java b/fe/src/main/java/org/apache/doris/metric/MetricRepo.java
index 3b2fd54..1fa58a2 100644
--- a/fe/src/main/java/org/apache/doris/metric/MetricRepo.java
+++ b/fe/src/main/java/org/apache/doris/metric/MetricRepo.java
@@ -275,6 +275,28 @@ public final class MetricRepo {
         };
         tpcInErrs.addLabel(new MetricLabel("name", "tcp_in_errs"));
         PALO_METRIC_REGISTER.addPaloMetrics(tpcInErrs);
+
+        // TCP inSegs
+        GaugeMetric<Long> tpcInSegs = (GaugeMetric<Long>) new GaugeMetric<Long>(
+                "snmp", MetricUnit.NOUNIT, "The number of all TCP packets received") {
+            @Override
+            public Long getValue() {
+                return SYSTEM_METRICS.tcpInSegs;
+            }
+        };
+        tpcInSegs.addLabel(new MetricLabel("name", "tcp_in_segs"));
+        PALO_METRIC_REGISTER.addPaloMetrics(tpcInSegs);
+
+        // TCP outSegs
+        GaugeMetric<Long> tpcOutSegs = (GaugeMetric<Long>) new GaugeMetric<Long>(
+                "snmp", MetricUnit.NOUNIT, "The number of all TCP packets send with RST") {
+            @Override
+            public Long getValue() {
+                return SYSTEM_METRICS.tcpOutSegs;
+            }
+        };
+        tpcOutSegs.addLabel(new MetricLabel("name", "tcp_out_segs"));
+        PALO_METRIC_REGISTER.addPaloMetrics(tpcOutSegs);
     }
 
     // to generate the metrics related to tablets of each backends
diff --git a/fe/src/main/java/org/apache/doris/metric/SystemMetrics.java b/fe/src/main/java/org/apache/doris/metric/SystemMetrics.java
index 78ec70f..15b221d 100644
--- a/fe/src/main/java/org/apache/doris/metric/SystemMetrics.java
+++ b/fe/src/main/java/org/apache/doris/metric/SystemMetrics.java
@@ -19,11 +19,14 @@ package org.apache.doris.metric;
 
 import org.apache.doris.common.FeConstants;
 
+import com.google.common.collect.Maps;
+
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 
 import java.io.BufferedReader;
 import java.io.FileReader;
+import java.util.Map;
 
 /**
  * Save system metrics such as CPU, MEM, IO, Networks.
@@ -38,6 +41,10 @@ public class SystemMetrics {
     protected long tcpRetransSegs = 0;
     // The number of all problematic TCP packets received
     protected long tcpInErrs = 0;
+    // All received TCP packets
+    protected long tcpInSegs = 0;
+    // All sent TCP packets (the `OutSegs` field; packets with the RST mark are counted separately by `OutRsts`)
+    protected long tcpOutSegs = 0;
 
     public synchronized void update() {
         updateSnmpMetrics();
@@ -61,19 +68,30 @@ public class SystemMetrics {
             if (!found) {
                 throw new Exception("can not find tcp metrics");
             }
-            // skip tcp header line
+
+            // parse the header of TCP
+            String[] headers = line.split(" ");
+            Map<String, Integer> headerMap = Maps.newHashMap();
+            int pos = 0;
+            for (int i = 0; i < headers.length; i++) {
+                headerMap.put(headers[i], pos++);
+            }
+
+            // read the metrics of TCP
             if ((line = br.readLine()) == null) {
-                throw new Exception("failed to skip tcp metrics header");
+                throw new Exception("failed to read metrics of TCP");
             }
             
             // eg: Tcp: 1 200 120000 -1 38920626 10487279 105581903 300009 305 18079291213 15411998945 11808180 22905 4174570 0
             String[] parts = line.split(" ");
-            if (parts.length != 16) {
-                throw new Exception("invalid tcp metrics: " + line);
+            if (parts.length != headerMap.size()) {
+                throw new Exception("invalid tcp metrics: " + line + ". header size: " + headerMap.size());
             }
 
-            tcpRetransSegs = Long.valueOf(parts[12]);
-            tcpInErrs = Long.valueOf(parts[13]);
+            tcpRetransSegs = Long.valueOf(parts[headerMap.get("RetransSegs")]);
+            tcpInErrs = Long.valueOf(parts[headerMap.get("InErrs")]);
+            tcpInSegs = Long.valueOf(parts[headerMap.get("InSegs")]);
+            tcpOutSegs = Long.valueOf(parts[headerMap.get("OutSegs")]);
 
         } catch (Exception e) {
             LOG.warn("failed to get /proc/net/snmp", e);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org