You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2020/06/24 13:29:14 UTC
[incubator-doris] branch master updated: [Bug] Enable to get TCP
metrics for linux kernel 2.x (#3921)
This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push:
new 46c64f0 [Bug] Enable to get TCP metrics for linux kernel 2.x (#3921)
46c64f0 is described below
commit 46c64f08612881901f6200c2688cfc5821b3290e
Author: Mingyu Chen <mo...@gmail.com>
AuthorDate: Wed Jun 24 21:29:07 2020 +0800
[Bug] Enable to get TCP metrics for linux kernel 2.x (#3921)
Fix #3920
CL:
1. Parse the TCP metrics header in `/proc/net/snmp` to get the right position of the metrics.
2. Add 2 new metrics: `tcp_in_segs` and `tcp_out_segs`
---
be/src/util/system_metrics.cpp | 37 +++++++++++++++++-----
.../operation/monitor-metrics/be-metrics.md | 16 ++++++++++
.../operation/monitor-metrics/fe-metrics.md | 16 ++++++++++
.../operation/monitor-metrics/be-metrics.md | 16 ++++++++++
.../operation/monitor-metrics/fe-metrics.md | 16 ++++++++++
.../java/org/apache/doris/metric/MetricRepo.java | 22 +++++++++++++
.../org/apache/doris/metric/SystemMetrics.java | 30 ++++++++++++++----
7 files changed, 139 insertions(+), 14 deletions(-)
diff --git a/be/src/util/system_metrics.cpp b/be/src/util/system_metrics.cpp
index a572661..79a5907 100644
--- a/be/src/util/system_metrics.cpp
+++ b/be/src/util/system_metrics.cpp
@@ -16,6 +16,8 @@
// under the License.
#include "util/system_metrics.h"
+#include "gutil/strings/split.h" // for string split
+#include "gutil/strtoint.h" // for atoi64
#include <stdio.h>
#include <gperftools/malloc_extension.h>
@@ -72,6 +74,10 @@ struct SnmpMetrics {
METRIC_DEFINE_INT_LOCK_COUNTER(tcp_in_errs, MetricUnit::NOUNIT);
// All TCP packets retransmitted
METRIC_DEFINE_INT_LOCK_COUNTER(tcp_retrans_segs, MetricUnit::NOUNIT);
+ // All received TCP packets
+ METRIC_DEFINE_INT_LOCK_COUNTER(tcp_in_segs, MetricUnit::NOUNIT);
+ // All sent TCP packets
+ METRIC_DEFINE_INT_LOCK_COUNTER(tcp_out_segs, MetricUnit::NOUNIT);
};
struct FileDescriptorMetrics {
@@ -323,6 +329,8 @@ void SystemMetrics::_install_snmp_metrics(MetricRegistry* registry) {
&_snmp_metrics->name)
REGISTER_SNMP_METRIC(tcp_in_errs);
REGISTER_SNMP_METRIC(tcp_retrans_segs);
+ REGISTER_SNMP_METRIC(tcp_in_segs);
+ REGISTER_SNMP_METRIC(tcp_out_segs);
}
void SystemMetrics::_update_net_metrics() {
@@ -449,8 +457,16 @@ void SystemMetrics::_update_snmp_metrics() {
return;
}
- // skip the Tcp header line
+ // parse the Tcp header
// Tcp: RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors
+ std::vector<std::string> headers = strings::Split(_line_ptr, " ");
+ std::unordered_map<std::string, int32_t> header_map;
+ int32_t pos = 0;
+ for (auto& h : headers) {
+ header_map.emplace(h, pos++);
+ }
+
+ // read the metrics of TCP
if (getline(&_line_ptr, &_line_buf_size, fp) < 0) {
char buf[64];
LOG(WARNING) << "failed to skip Tcp header line of /proc/net/snmp, errno=" << errno
@@ -461,15 +477,20 @@ void SystemMetrics::_update_snmp_metrics() {
// metric line looks like:
// Tcp: 1 200 120000 -1 47849374 38601877 3353843 2320314 276 1033354613 1166025166 825439 12694 23238924 0
- int64_t retrans_segs = 0;
- int64_t in_errs = 0;
- sscanf(_line_ptr,
- "Tcp: %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d"
- " %" PRId64 " %" PRId64 " %*d %*d",
- &retrans_segs, &in_errs);
-
+ std::vector<std::string> metrics = strings::Split(_line_ptr, " ");
+ if (metrics.size() != headers.size()) {
+ LOG(WARNING) << "invalid tcp metrics line: " << _line_ptr;
+ fclose(fp);
+ return;
+ }
+ int64_t retrans_segs = atoi64(metrics[header_map["RetransSegs"]]);
+ int64_t in_errs = atoi64(metrics[header_map["InErrs"]]);
+ int64_t in_segs = atoi64(metrics[header_map["InSegs"]]);
+ int64_t out_segs = atoi64(metrics[header_map["OutSegs"]]);
_snmp_metrics->tcp_retrans_segs.set_value(retrans_segs);
_snmp_metrics->tcp_in_errs.set_value(in_errs);
+ _snmp_metrics->tcp_in_segs.set_value(in_segs);
+ _snmp_metrics->tcp_out_segs.set_value(out_segs);
if (ferror(fp) != 0) {
char buf[64];
diff --git a/docs/en/administrator-guide/operation/monitor-metrics/be-metrics.md b/docs/en/administrator-guide/operation/monitor-metrics/be-metrics.md
index 642228f..a2716a7 100644
--- a/docs/en/administrator-guide/operation/monitor-metrics/be-metrics.md
+++ b/docs/en/administrator-guide/operation/monitor-metrics/be-metrics.md
@@ -59,3 +59,19 @@ Value of the `Tcp: RetransSegs` field in `/proc/net/snmp`. Represents the number
The incidence rate can be calculated in combination with the sampling period.
Usually used to troubleshoot network problems.
+
+### `doris_be_snmp{name="tcp_in_segs"}`
+
+Value of the `Tcp: InSegs` field in `/proc/net/snmp`. Represents the number of received TCP packets.
+
+The error rate of received TCP packets can be calculated as `(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)`.
+
+Usually used to troubleshoot network problems.
+
+### `doris_be_snmp{name="tcp_out_segs"}`
+
+Value of the `Tcp: OutSegs` field in `/proc/net/snmp`. Represents the total number of TCP packets sent.
+
+The retransmission rate of TCP packets can be calculated as `(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)`.
+
+Usually used to troubleshoot network problems.
diff --git a/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md b/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md
index 9c5487f..26ba0ac 100644
--- a/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md
+++ b/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md
@@ -59,3 +59,19 @@ Value of the `Tcp: RetransSegs` field in `/proc/net/snmp`. Represents the number
The incidence rate can be calculated in combination with the sampling period.
Usually used to troubleshoot network problems.
+
+### `doris_fe_snmp{name="tcp_in_segs"}`
+
+Value of the `Tcp: InSegs` field in `/proc/net/snmp`. Represents the number of received TCP packets.
+
+The error rate of received TCP packets can be calculated as `(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)`.
+
+Usually used to troubleshoot network problems.
+
+### `doris_fe_snmp{name="tcp_out_segs"}`
+
+Value of the `Tcp: OutSegs` field in `/proc/net/snmp`. Represents the total number of TCP packets sent.
+
+The retransmission rate of TCP packets can be calculated as `(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)`.
+
+Usually used to troubleshoot network problems.
diff --git a/docs/zh-CN/administrator-guide/operation/monitor-metrics/be-metrics.md b/docs/zh-CN/administrator-guide/operation/monitor-metrics/be-metrics.md
index 1a2afde..41533b8 100644
--- a/docs/zh-CN/administrator-guide/operation/monitor-metrics/be-metrics.md
+++ b/docs/zh-CN/administrator-guide/operation/monitor-metrics/be-metrics.md
@@ -59,3 +59,19 @@ BE 的监控项可以通过以下方式访问:
结合采样周期可以计算发生率。
通常用于排查网络问题。
+
+### `doris_be_snmp{name="tcp_in_segs"}`
+
+该监控项为 `/proc/net/snmp` 中的 `Tcp: InSegs` 字段值。表示当前接收到的所有 TCP 包的数量。
+
+通过 `(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)` 可以计算接收到的 TCP 错误包率。
+
+通常用于排查网络问题。
+
+### `doris_be_snmp{name="tcp_out_segs"}`
+
+该监控项为 `/proc/net/snmp` 中的 `Tcp: OutSegs` 字段值。表示当前发送的所有 TCP 包的数量。
+
+通过 `(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)` 可以计算 TCP 重传率。
+
+通常用于排查网络问题。
diff --git a/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md b/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md
index a938ae6..aaa1854 100644
--- a/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md
+++ b/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md
@@ -59,3 +59,19 @@ FE 的监控项可以通过以下方式访问:
结合采样周期可以计算发生率。
通常用于排查网络问题。
+
+### `doris_fe_snmp{name="tcp_in_segs"}`
+
+该监控项为 `/proc/net/snmp` 中的 `Tcp: InSegs` 字段值。表示当前接收到的所有 TCP 包的数量。
+
+通过 `(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)` 可以计算接收到的 TCP 错误包率。
+
+通常用于排查网络问题。
+
+### `doris_fe_snmp{name="tcp_out_segs"}`
+
+该监控项为 `/proc/net/snmp` 中的 `Tcp: OutSegs` 字段值。表示当前发送的所有 TCP 包的数量。
+
+通过 `(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)` 可以计算 TCP 重传率。
+
+通常用于排查网络问题。
diff --git a/fe/src/main/java/org/apache/doris/metric/MetricRepo.java b/fe/src/main/java/org/apache/doris/metric/MetricRepo.java
index 3b2fd54..1fa58a2 100644
--- a/fe/src/main/java/org/apache/doris/metric/MetricRepo.java
+++ b/fe/src/main/java/org/apache/doris/metric/MetricRepo.java
@@ -275,6 +275,28 @@ public final class MetricRepo {
};
tpcInErrs.addLabel(new MetricLabel("name", "tcp_in_errs"));
PALO_METRIC_REGISTER.addPaloMetrics(tpcInErrs);
+
+ // TCP inSegs
+ GaugeMetric<Long> tpcInSegs = (GaugeMetric<Long>) new GaugeMetric<Long>(
+ "snmp", MetricUnit.NOUNIT, "The number of all TCP packets received") {
+ @Override
+ public Long getValue() {
+ return SYSTEM_METRICS.tcpInSegs;
+ }
+ };
+ tpcInSegs.addLabel(new MetricLabel("name", "tcp_in_segs"));
+ PALO_METRIC_REGISTER.addPaloMetrics(tpcInSegs);
+
+ // TCP outSegs
+ GaugeMetric<Long> tpcOutSegs = (GaugeMetric<Long>) new GaugeMetric<Long>(
+ "snmp", MetricUnit.NOUNIT, "The number of all TCP packets send with RST") {
+ @Override
+ public Long getValue() {
+ return SYSTEM_METRICS.tcpOutSegs;
+ }
+ };
+ tpcOutSegs.addLabel(new MetricLabel("name", "tcp_out_segs"));
+ PALO_METRIC_REGISTER.addPaloMetrics(tpcOutSegs);
}
// to generate the metrics related to tablets of each backends
diff --git a/fe/src/main/java/org/apache/doris/metric/SystemMetrics.java b/fe/src/main/java/org/apache/doris/metric/SystemMetrics.java
index 78ec70f..15b221d 100644
--- a/fe/src/main/java/org/apache/doris/metric/SystemMetrics.java
+++ b/fe/src/main/java/org/apache/doris/metric/SystemMetrics.java
@@ -19,11 +19,14 @@ package org.apache.doris.metric;
import org.apache.doris.common.FeConstants;
+import com.google.common.collect.Maps;
+
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.BufferedReader;
import java.io.FileReader;
+import java.util.Map;
/**
* Save system metrics such as CPU, MEM, IO, Networks.
@@ -38,6 +41,10 @@ public class SystemMetrics {
protected long tcpRetransSegs = 0;
// The number of all problematic TCP packets received
protected long tcpInErrs = 0;
+ // All received TCP packets
+ protected long tcpInSegs = 0;
+ // All sent TCP packets
+ protected long tcpOutSegs = 0;
public synchronized void update() {
updateSnmpMetrics();
@@ -61,19 +68,30 @@ public class SystemMetrics {
if (!found) {
throw new Exception("can not find tcp metrics");
}
- // skip tcp header line
+
+ // parse the header of TCP
+ String[] headers = line.split(" ");
+ Map<String, Integer> headerMap = Maps.newHashMap();
+ int pos = 0;
+ for (int i = 0; i < headers.length; i++) {
+ headerMap.put(headers[i], pos++);
+ }
+
+ // read the metrics of TCP
if ((line = br.readLine()) == null) {
- throw new Exception("failed to skip tcp metrics header");
+ throw new Exception("failed to read metrics of TCP");
}
// eg: Tcp: 1 200 120000 -1 38920626 10487279 105581903 300009 305 18079291213 15411998945 11808180 22905 4174570 0
String[] parts = line.split(" ");
- if (parts.length != 16) {
- throw new Exception("invalid tcp metrics: " + line);
+ if (parts.length != headerMap.size()) {
+ throw new Exception("invalid tcp metrics: " + line + ". header size: " + headerMap.size());
}
- tcpRetransSegs = Long.valueOf(parts[12]);
- tcpInErrs = Long.valueOf(parts[13]);
+ tcpRetransSegs = Long.valueOf(parts[headerMap.get("RetransSegs")]);
+ tcpInErrs = Long.valueOf(parts[headerMap.get("InErrs")]);
+ tcpInSegs = Long.valueOf(parts[headerMap.get("InSegs")]);
+ tcpOutSegs = Long.valueOf(parts[headerMap.get("OutSegs")]);
} catch (Exception e) {
LOG.warn("failed to get /proc/net/snmp", e);
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org