You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@skywalking.apache.org by wu...@apache.org on 2021/05/25 06:19:46 UTC

[skywalking] 01/01: Fix counter misuse in the alarm core

This is an automated email from the ASF dual-hosted git repository.

wusheng pushed a commit to branch alarm-core-bug
in repository https://gitbox.apache.org/repos/asf/skywalking.git

commit 32d664172e1f0691fe516c8d9b8a418a89502211
Author: Wu Sheng <wu...@foxmail.com>
AuthorDate: Tue May 25 14:19:20 2021 +0800

    Fix counter misuse in the alarm core
---
 .../server/core/alarm/provider/RunningRule.java    | 63 +++++++++++-----------
 .../core/alarm/provider/RunningRuleTest.java       | 63 +++++++++-------------
 2 files changed, 59 insertions(+), 67 deletions(-)

diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java
index 5ebd951..5d7e148 100644
--- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java
+++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java
@@ -29,7 +29,6 @@ import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.locks.ReentrantLock;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
-
 import lombok.RequiredArgsConstructor;
 import lombok.ToString;
 import lombok.extern.slf4j.Slf4j;
@@ -107,7 +106,11 @@ public class RunningRule {
             Pattern.compile(alarmRule.getExcludeLabelsRegex()) : null;
         this.formatter = new AlarmMessageFormatter(alarmRule.getMessage());
         this.onlyAsCondition = alarmRule.isOnlyAsCondition();
-        this.tags = alarmRule.getTags().entrySet().stream().map(e -> new Tag(e.getKey(), e.getValue())).collect(Collectors.toList());
+        this.tags = alarmRule.getTags()
+                             .entrySet()
+                             .stream()
+                             .map(e -> new Tag(e.getKey(), e.getValue()))
+                             .collect(Collectors.toList());
     }
 
     /**
@@ -146,12 +149,13 @@ public class RunningRule {
                 threshold.setType(MetricsValueType.MULTI_INTS);
             } else if (metrics instanceof LabeledValueHolder) {
                 if (((LabeledValueHolder) metrics).getValue().keys().stream()
-                    .noneMatch(label -> validate(
-                        label,
-                        includeLabels,
-                        excludeLabels,
-                        includeLabelsRegex,
-                        excludeLabelsRegex))) {
+                                                  .noneMatch(label -> validate(
+                                                      label,
+                                                      includeLabels,
+                                                      excludeLabels,
+                                                      includeLabelsRegex,
+                                                      excludeLabelsRegex
+                                                  ))) {
                     return;
                 }
                 valueType = MetricsValueType.LABELED_LONG;
@@ -169,11 +173,11 @@ public class RunningRule {
     }
 
     /**
-     * Validate target whether matching rules which is included list, excludes list, include regular expression
-     * or exclude regular expression.
+     * Validate target whether matching rules which is included list, excludes list, include regular expression or
+     * exclude regular expression.
      */
     private boolean validate(String target, List<String> includeList, List<String> excludeList,
-        Pattern includeRegex, Pattern excludeRegex) {
+                             Pattern includeRegex, Pattern excludeRegex) {
         if (CollectionUtils.isNotEmpty(includeList)) {
             if (!includeList.contains(target)) {
                 if (log.isTraceEnabled()) {
@@ -256,7 +260,6 @@ public class RunningRule {
     public class Window {
         private LocalDateTime endTime;
         private int period;
-        private int counter;
         private int silenceCountdown;
 
         private LinkedList<Metrics> values;
@@ -266,7 +269,6 @@ public class RunningRule {
             this.period = period;
             // -1 means silence countdown is not running.
             silenceCountdown = -1;
-            counter = 0;
             init();
         }
 
@@ -320,7 +322,10 @@ public class RunningRule {
                     // too old data
                     // also should happen, but maybe if agent/probe mechanism time is not right.
                     if (log.isTraceEnabled()) {
-                        log.trace("Timebucket is {}, endTime is {} and value size is {}", timeBucket, this.endTime, values.size());
+                        log.trace(
+                            "Timebucket is {}, endTime is {} and value size is {}", timeBucket, this.endTime,
+                            values.size()
+                        );
                     }
                     return;
                 }
@@ -338,12 +343,10 @@ public class RunningRule {
             if (isMatch()) {
                 /*
                  * When
-                 * 1. Metrics value threshold triggers alarm by rule
-                 * 2. Counter reaches the count threshold;
-                 * 3. Isn't in silence stage, judged by SilenceCountdown(!=0).
+                 * 1. Alarm trigger conditions are satisfied.
+                 * 2. Isn't in silence stage, judged by SilenceCountdown(!=0).
                  */
-                counter++;
-                if (counter >= countThreshold && silenceCountdown < 1) {
+                if (silenceCountdown < 1) {
                     silenceCountdown = silencePeriod;
                     return Optional.of(new AlarmMessage());
                 } else {
@@ -351,9 +354,6 @@ public class RunningRule {
                 }
             } else {
                 silenceCountdown--;
-                if (counter > 0) {
-                    counter--;
-                }
             }
             return Optional.empty();
         }
@@ -415,13 +415,14 @@ public class RunningRule {
                         DataTable values = ((LabeledValueHolder) metrics).getValue();
                         lexpected = RunningRule.this.threshold.getLongThreshold();
                         if (values.keys().stream().anyMatch(label ->
-                            validate(
-                                label,
-                                RunningRule.this.includeLabels,
-                                RunningRule.this.excludeLabels,
-                                RunningRule.this.includeLabelsRegex,
-                                RunningRule.this.excludeLabelsRegex)
-                            && op.test(lexpected, values.get(label)))) {
+                                                                validate(
+                                                                    label,
+                                                                    RunningRule.this.includeLabels,
+                                                                    RunningRule.this.excludeLabels,
+                                                                    RunningRule.this.includeLabelsRegex,
+                                                                    RunningRule.this.excludeLabelsRegex
+                                                                )
+                                                                    && op.test(lexpected, values.get(label)))) {
                             matchCount++;
                         }
                         break;
@@ -466,7 +467,9 @@ public class RunningRule {
                     break;
                 case LABELED_LONG:
                     DataTable dt = ((LabeledValueHolder) m).getValue();
-                    TraceLogMetric l = new TraceLogMetric(m.getTimeBucket(), dt.sortedValues(Comparator.naturalOrder()).toArray(new Number[0]));
+                    TraceLogMetric l = new TraceLogMetric(
+                        m.getTimeBucket(), dt.sortedValues(Comparator.naturalOrder())
+                                             .toArray(new Number[0]));
                     l.labels = dt.sortedKeys(Comparator.naturalOrder()).toArray(new String[0]);
                     r.add(l);
             }
diff --git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java
index a7229ec..a025aeb 100644
--- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java
+++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java
@@ -19,11 +19,11 @@
 package org.apache.skywalking.oap.server.core.alarm.provider;
 
 import com.google.common.collect.Lists;
+import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
-import java.util.HashMap;
 import lombok.Getter;
 import lombok.Setter;
 import org.apache.skywalking.oap.server.core.Const;
@@ -103,21 +103,16 @@ public class RunningRuleTest {
 
         runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70));
         runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71));
-        runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
 
         // check at 201808301440
         List<AlarmMessage> alarmMessages = runningRule.check();
         Assert.assertEquals(0, alarmMessages.size());
-        runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
-        // check at 201808301441
-        alarmMessages = runningRule.check();
-        Assert.assertEquals(0, alarmMessages.size());
-        runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
-        // check at 201808301442
+
+        runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
+
+        // check at 201808301440
         alarmMessages = runningRule.check();
         Assert.assertEquals(1, alarmMessages.size());
-        Assert.assertEquals("Successful rate of endpoint Service_123 is lower than 75%", alarmMessages.get(0)
-                                                                                                      .getAlarmMessage());
     }
 
     @Test
@@ -142,22 +137,18 @@ public class RunningRuleTest {
 
         runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod1, 70, 60, 40, 40, 40));
         runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod2, 60, 60, 40, 40, 40));
-        runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod3, 74, 60, 40, 40, 40));
 
         // check at 201808301440
         List<AlarmMessage> alarmMessages = runningRule.check();
         Assert.assertEquals(0, alarmMessages.size());
         runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
-        // check at 201808301441
-        alarmMessages = runningRule.check();
-        Assert.assertEquals(0, alarmMessages.size());
-        runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
-        // check at 201808301442
+
+        runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod3, 74, 60, 40, 40, 40));
+
+        // check at 201808301440
         alarmMessages = runningRule.check();
         Assert.assertEquals(1, alarmMessages.size());
-        Assert.assertEquals(
-            "response percentile of endpoint Service_123 is lower than expected values", alarmMessages.get(0)
-                                                                                                      .getAlarmMessage());
+        runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
     }
 
     @Test
@@ -243,16 +234,18 @@ public class RunningRuleTest {
         long timeInPeriod3 = 201808301438L;
         runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70));
         runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71));
-        runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
 
         // check at 201808301440
         Assert.assertEquals(0, runningRule.check().size()); //check matches, no alarm
         runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
-        // check at 201808301441
-        Assert.assertEquals(0, runningRule.check().size()); //check matches, no alarm
-        runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
+
+        runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
+
+        // check at 201808301440
+        Assert.assertEquals(1, runningRule.check().size()); //alarm
+        runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
+
         // check at 201808301442
-        Assert.assertNotEquals(0, runningRule.check().size()); //alarm
         Assert.assertEquals(0, runningRule.check().size()); //silence, no alarm
         Assert.assertEquals(0, runningRule.check().size()); //silence, no alarm
         Assert.assertNotEquals(0, runningRule.check().size()); //alarm
@@ -304,7 +297,8 @@ public class RunningRuleTest {
         alarmRule.setThreshold("1000");
         alarmRule.setCount(1);
         alarmRule.setPeriod(10);
-        alarmRule.setMessage("Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");
+        alarmRule.setMessage(
+            "Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");
         alarmRule.setIncludeNamesRegex("Service\\_1(\\d)+");
         alarmRule.setTags(new HashMap<String, String>() {{
             put("key", "value");
@@ -338,7 +332,8 @@ public class RunningRuleTest {
         alarmRule.setThreshold("1000");
         alarmRule.setCount(1);
         alarmRule.setPeriod(10);
-        alarmRule.setMessage("Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");
+        alarmRule.setMessage(
+            "Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");
         alarmRule.setExcludeNamesRegex("Service\\_2(\\d)+");
         alarmRule.setTags(new HashMap<String, String>() {{
             put("key", "value");
@@ -603,22 +598,16 @@ public class RunningRuleTest {
 
         runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod1, "50,17|99,11"));
         runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod2, "75,15|95,12"));
-        runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod3, "90,1|99,20"));
 
-        // check at 201808301440
         List<AlarmMessage> alarmMessages = runningRule.check();
         Assert.assertEquals(0, alarmMessages.size());
         runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
-        // check at 201808301441
-        alarmMessages = runningRule.check();
-        Assert.assertEquals(0, alarmMessages.size());
-        runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
-        // check at 201808301442
+
+        runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod3, "90,1|99,20"));
+
+        // check at 201808301440
         alarmMessages = runningRule.check();
         Assert.assertEquals(1, alarmMessages.size());
-        Assert.assertEquals(
-            "response percentile of endpoint Service_123 is lower than expected value", alarmMessages.get(0)
-                .getAlarmMessage());
-
+        runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
     }
 }