You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@skywalking.apache.org by wu...@apache.org on 2021/05/25 06:19:46 UTC
[skywalking] 01/01: Fix counter misuse in the alarm core
This is an automated email from the ASF dual-hosted git repository.
wusheng pushed a commit to branch alarm-core-bug
in repository https://gitbox.apache.org/repos/asf/skywalking.git
commit 32d664172e1f0691fe516c8d9b8a418a89502211
Author: Wu Sheng <wu...@foxmail.com>
AuthorDate: Tue May 25 14:19:20 2021 +0800
Fix counter misuse in the alarm core
---
.../server/core/alarm/provider/RunningRule.java | 63 +++++++++++-----------
.../core/alarm/provider/RunningRuleTest.java | 63 +++++++++-------------
2 files changed, 59 insertions(+), 67 deletions(-)
diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java
index 5ebd951..5d7e148 100644
--- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java
+++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java
@@ -29,7 +29,6 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.ReentrantLock;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
-
import lombok.RequiredArgsConstructor;
import lombok.ToString;
import lombok.extern.slf4j.Slf4j;
@@ -107,7 +106,11 @@ public class RunningRule {
Pattern.compile(alarmRule.getExcludeLabelsRegex()) : null;
this.formatter = new AlarmMessageFormatter(alarmRule.getMessage());
this.onlyAsCondition = alarmRule.isOnlyAsCondition();
- this.tags = alarmRule.getTags().entrySet().stream().map(e -> new Tag(e.getKey(), e.getValue())).collect(Collectors.toList());
+ this.tags = alarmRule.getTags()
+ .entrySet()
+ .stream()
+ .map(e -> new Tag(e.getKey(), e.getValue()))
+ .collect(Collectors.toList());
}
/**
@@ -146,12 +149,13 @@ public class RunningRule {
threshold.setType(MetricsValueType.MULTI_INTS);
} else if (metrics instanceof LabeledValueHolder) {
if (((LabeledValueHolder) metrics).getValue().keys().stream()
- .noneMatch(label -> validate(
- label,
- includeLabels,
- excludeLabels,
- includeLabelsRegex,
- excludeLabelsRegex))) {
+ .noneMatch(label -> validate(
+ label,
+ includeLabels,
+ excludeLabels,
+ includeLabelsRegex,
+ excludeLabelsRegex
+ ))) {
return;
}
valueType = MetricsValueType.LABELED_LONG;
@@ -169,11 +173,11 @@ public class RunningRule {
}
/**
- * Validate target whether matching rules which is included list, excludes list, include regular expression
- * or exclude regular expression.
+ * Validate target whether matching rules which is included list, excludes list, include regular expression or
+ * exclude regular expression.
*/
private boolean validate(String target, List<String> includeList, List<String> excludeList,
- Pattern includeRegex, Pattern excludeRegex) {
+ Pattern includeRegex, Pattern excludeRegex) {
if (CollectionUtils.isNotEmpty(includeList)) {
if (!includeList.contains(target)) {
if (log.isTraceEnabled()) {
@@ -256,7 +260,6 @@ public class RunningRule {
public class Window {
private LocalDateTime endTime;
private int period;
- private int counter;
private int silenceCountdown;
private LinkedList<Metrics> values;
@@ -266,7 +269,6 @@ public class RunningRule {
this.period = period;
// -1 means silence countdown is not running.
silenceCountdown = -1;
- counter = 0;
init();
}
@@ -320,7 +322,10 @@ public class RunningRule {
// too old data
// also should happen, but maybe if agent/probe mechanism time is not right.
if (log.isTraceEnabled()) {
- log.trace("Timebucket is {}, endTime is {} and value size is {}", timeBucket, this.endTime, values.size());
+ log.trace(
+ "Timebucket is {}, endTime is {} and value size is {}", timeBucket, this.endTime,
+ values.size()
+ );
}
return;
}
@@ -338,12 +343,10 @@ public class RunningRule {
if (isMatch()) {
/*
* When
- * 1. Metrics value threshold triggers alarm by rule
- * 2. Counter reaches the count threshold;
- * 3. Isn't in silence stage, judged by SilenceCountdown(!=0).
+ * 1. Alarm trigger conditions are satisfied.
+ * 2. Isn't in silence stage, judged by SilenceCountdown(!=0).
*/
- counter++;
- if (counter >= countThreshold && silenceCountdown < 1) {
+ if (silenceCountdown < 1) {
silenceCountdown = silencePeriod;
return Optional.of(new AlarmMessage());
} else {
@@ -351,9 +354,6 @@ public class RunningRule {
}
} else {
silenceCountdown--;
- if (counter > 0) {
- counter--;
- }
}
return Optional.empty();
}
@@ -415,13 +415,14 @@ public class RunningRule {
DataTable values = ((LabeledValueHolder) metrics).getValue();
lexpected = RunningRule.this.threshold.getLongThreshold();
if (values.keys().stream().anyMatch(label ->
- validate(
- label,
- RunningRule.this.includeLabels,
- RunningRule.this.excludeLabels,
- RunningRule.this.includeLabelsRegex,
- RunningRule.this.excludeLabelsRegex)
- && op.test(lexpected, values.get(label)))) {
+ validate(
+ label,
+ RunningRule.this.includeLabels,
+ RunningRule.this.excludeLabels,
+ RunningRule.this.includeLabelsRegex,
+ RunningRule.this.excludeLabelsRegex
+ )
+ && op.test(lexpected, values.get(label)))) {
matchCount++;
}
break;
@@ -466,7 +467,9 @@ public class RunningRule {
break;
case LABELED_LONG:
DataTable dt = ((LabeledValueHolder) m).getValue();
- TraceLogMetric l = new TraceLogMetric(m.getTimeBucket(), dt.sortedValues(Comparator.naturalOrder()).toArray(new Number[0]));
+ TraceLogMetric l = new TraceLogMetric(
+ m.getTimeBucket(), dt.sortedValues(Comparator.naturalOrder())
+ .toArray(new Number[0]));
l.labels = dt.sortedKeys(Comparator.naturalOrder()).toArray(new String[0]);
r.add(l);
}
diff --git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java
index a7229ec..a025aeb 100644
--- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java
+++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java
@@ -19,11 +19,11 @@
package org.apache.skywalking.oap.server.core.alarm.provider;
import com.google.common.collect.Lists;
+import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
-import java.util.HashMap;
import lombok.Getter;
import lombok.Setter;
import org.apache.skywalking.oap.server.core.Const;
@@ -103,21 +103,16 @@ public class RunningRuleTest {
runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70));
runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71));
- runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
// check at 201808301440
List<AlarmMessage> alarmMessages = runningRule.check();
Assert.assertEquals(0, alarmMessages.size());
- runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
- // check at 201808301441
- alarmMessages = runningRule.check();
- Assert.assertEquals(0, alarmMessages.size());
- runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
- // check at 201808301442
+
+ runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
+
+ // check at 201808301440
alarmMessages = runningRule.check();
Assert.assertEquals(1, alarmMessages.size());
- Assert.assertEquals("Successful rate of endpoint Service_123 is lower than 75%", alarmMessages.get(0)
- .getAlarmMessage());
}
@Test
@@ -142,22 +137,18 @@ public class RunningRuleTest {
runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod1, 70, 60, 40, 40, 40));
runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod2, 60, 60, 40, 40, 40));
- runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod3, 74, 60, 40, 40, 40));
// check at 201808301440
List<AlarmMessage> alarmMessages = runningRule.check();
Assert.assertEquals(0, alarmMessages.size());
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
- // check at 201808301441
- alarmMessages = runningRule.check();
- Assert.assertEquals(0, alarmMessages.size());
- runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
- // check at 201808301442
+
+ runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod3, 74, 60, 40, 40, 40));
+
+ // check at 201808301440
alarmMessages = runningRule.check();
Assert.assertEquals(1, alarmMessages.size());
- Assert.assertEquals(
- "response percentile of endpoint Service_123 is lower than expected values", alarmMessages.get(0)
- .getAlarmMessage());
+ runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
}
@Test
@@ -243,16 +234,18 @@ public class RunningRuleTest {
long timeInPeriod3 = 201808301438L;
runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70));
runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71));
- runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
// check at 201808301440
Assert.assertEquals(0, runningRule.check().size()); //check matches, no alarm
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
- // check at 201808301441
- Assert.assertEquals(0, runningRule.check().size()); //check matches, no alarm
- runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
+
+ runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
+
+ // check at 201808301440
+ Assert.assertEquals(1, runningRule.check().size()); //alarm
+ runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
+
// check at 201808301442
- Assert.assertNotEquals(0, runningRule.check().size()); //alarm
Assert.assertEquals(0, runningRule.check().size()); //silence, no alarm
Assert.assertEquals(0, runningRule.check().size()); //silence, no alarm
Assert.assertNotEquals(0, runningRule.check().size()); //alarm
@@ -304,7 +297,8 @@ public class RunningRuleTest {
alarmRule.setThreshold("1000");
alarmRule.setCount(1);
alarmRule.setPeriod(10);
- alarmRule.setMessage("Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");
+ alarmRule.setMessage(
+ "Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");
alarmRule.setIncludeNamesRegex("Service\\_1(\\d)+");
alarmRule.setTags(new HashMap<String, String>() {{
put("key", "value");
@@ -338,7 +332,8 @@ public class RunningRuleTest {
alarmRule.setThreshold("1000");
alarmRule.setCount(1);
alarmRule.setPeriod(10);
- alarmRule.setMessage("Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");
+ alarmRule.setMessage(
+ "Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");
alarmRule.setExcludeNamesRegex("Service\\_2(\\d)+");
alarmRule.setTags(new HashMap<String, String>() {{
put("key", "value");
@@ -603,22 +598,16 @@ public class RunningRuleTest {
runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod1, "50,17|99,11"));
runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod2, "75,15|95,12"));
- runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod3, "90,1|99,20"));
- // check at 201808301440
List<AlarmMessage> alarmMessages = runningRule.check();
Assert.assertEquals(0, alarmMessages.size());
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
- // check at 201808301441
- alarmMessages = runningRule.check();
- Assert.assertEquals(0, alarmMessages.size());
- runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
- // check at 201808301442
+
+ runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod3, "90,1|99,20"));
+
+ // check at 201808301440
alarmMessages = runningRule.check();
Assert.assertEquals(1, alarmMessages.size());
- Assert.assertEquals(
- "response percentile of endpoint Service_123 is lower than expected value", alarmMessages.get(0)
- .getAlarmMessage());
-
+ runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
}
}