You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@skywalking.apache.org by wu...@apache.org on 2021/05/25 06:19:45 UTC

[skywalking] branch alarm-core-bug created (now 32d6641)

This is an automated email from the ASF dual-hosted git repository.

wusheng pushed a change to branch alarm-core-bug
in repository https://gitbox.apache.org/repos/asf/skywalking.git.


      at 32d6641  Fix counter misuse in the alarm core

This branch includes the following new commits:

     new 32d6641  Fix counter misuse in the alarm core

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


[skywalking] 01/01: Fix counter misuse in the alarm core

Posted by wu...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

wusheng pushed a commit to branch alarm-core-bug
in repository https://gitbox.apache.org/repos/asf/skywalking.git

commit 32d664172e1f0691fe516c8d9b8a418a89502211
Author: Wu Sheng <wu...@foxmail.com>
AuthorDate: Tue May 25 14:19:20 2021 +0800

    Fix counter misuse in the alarm core
---
 .../server/core/alarm/provider/RunningRule.java    | 63 +++++++++++-----------
 .../core/alarm/provider/RunningRuleTest.java       | 63 +++++++++-------------
 2 files changed, 59 insertions(+), 67 deletions(-)

diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java
index 5ebd951..5d7e148 100644
--- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java
+++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java
@@ -29,7 +29,6 @@ import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.locks.ReentrantLock;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
-
 import lombok.RequiredArgsConstructor;
 import lombok.ToString;
 import lombok.extern.slf4j.Slf4j;
@@ -107,7 +106,11 @@ public class RunningRule {
             Pattern.compile(alarmRule.getExcludeLabelsRegex()) : null;
         this.formatter = new AlarmMessageFormatter(alarmRule.getMessage());
         this.onlyAsCondition = alarmRule.isOnlyAsCondition();
-        this.tags = alarmRule.getTags().entrySet().stream().map(e -> new Tag(e.getKey(), e.getValue())).collect(Collectors.toList());
+        this.tags = alarmRule.getTags()
+                             .entrySet()
+                             .stream()
+                             .map(e -> new Tag(e.getKey(), e.getValue()))
+                             .collect(Collectors.toList());
     }
 
     /**
@@ -146,12 +149,13 @@ public class RunningRule {
                 threshold.setType(MetricsValueType.MULTI_INTS);
             } else if (metrics instanceof LabeledValueHolder) {
                 if (((LabeledValueHolder) metrics).getValue().keys().stream()
-                    .noneMatch(label -> validate(
-                        label,
-                        includeLabels,
-                        excludeLabels,
-                        includeLabelsRegex,
-                        excludeLabelsRegex))) {
+                                                  .noneMatch(label -> validate(
+                                                      label,
+                                                      includeLabels,
+                                                      excludeLabels,
+                                                      includeLabelsRegex,
+                                                      excludeLabelsRegex
+                                                  ))) {
                     return;
                 }
                 valueType = MetricsValueType.LABELED_LONG;
@@ -169,11 +173,11 @@ public class RunningRule {
     }
 
     /**
-     * Validate target whether matching rules which is included list, excludes list, include regular expression
-     * or exclude regular expression.
+     * Validate target whether matching rules which is included list, excludes list, include regular expression or
+     * exclude regular expression.
      */
     private boolean validate(String target, List<String> includeList, List<String> excludeList,
-        Pattern includeRegex, Pattern excludeRegex) {
+                             Pattern includeRegex, Pattern excludeRegex) {
         if (CollectionUtils.isNotEmpty(includeList)) {
             if (!includeList.contains(target)) {
                 if (log.isTraceEnabled()) {
@@ -256,7 +260,6 @@ public class RunningRule {
     public class Window {
         private LocalDateTime endTime;
         private int period;
-        private int counter;
         private int silenceCountdown;
 
         private LinkedList<Metrics> values;
@@ -266,7 +269,6 @@ public class RunningRule {
             this.period = period;
             // -1 means silence countdown is not running.
             silenceCountdown = -1;
-            counter = 0;
             init();
         }
 
@@ -320,7 +322,10 @@ public class RunningRule {
                     // too old data
                     // also should happen, but maybe if agent/probe mechanism time is not right.
                     if (log.isTraceEnabled()) {
-                        log.trace("Timebucket is {}, endTime is {} and value size is {}", timeBucket, this.endTime, values.size());
+                        log.trace(
+                            "Timebucket is {}, endTime is {} and value size is {}", timeBucket, this.endTime,
+                            values.size()
+                        );
                     }
                     return;
                 }
@@ -338,12 +343,10 @@ public class RunningRule {
             if (isMatch()) {
                 /*
                  * When
-                 * 1. Metrics value threshold triggers alarm by rule
-                 * 2. Counter reaches the count threshold;
-                 * 3. Isn't in silence stage, judged by SilenceCountdown(!=0).
+                 * 1. Alarm trigger conditions are satisfied.
+                 * 2. Isn't in silence stage, judged by SilenceCountdown(!=0).
                  */
-                counter++;
-                if (counter >= countThreshold && silenceCountdown < 1) {
+                if (silenceCountdown < 1) {
                     silenceCountdown = silencePeriod;
                     return Optional.of(new AlarmMessage());
                 } else {
@@ -351,9 +354,6 @@ public class RunningRule {
                 }
             } else {
                 silenceCountdown--;
-                if (counter > 0) {
-                    counter--;
-                }
             }
             return Optional.empty();
         }
@@ -415,13 +415,14 @@ public class RunningRule {
                         DataTable values = ((LabeledValueHolder) metrics).getValue();
                         lexpected = RunningRule.this.threshold.getLongThreshold();
                         if (values.keys().stream().anyMatch(label ->
-                            validate(
-                                label,
-                                RunningRule.this.includeLabels,
-                                RunningRule.this.excludeLabels,
-                                RunningRule.this.includeLabelsRegex,
-                                RunningRule.this.excludeLabelsRegex)
-                            && op.test(lexpected, values.get(label)))) {
+                                                                validate(
+                                                                    label,
+                                                                    RunningRule.this.includeLabels,
+                                                                    RunningRule.this.excludeLabels,
+                                                                    RunningRule.this.includeLabelsRegex,
+                                                                    RunningRule.this.excludeLabelsRegex
+                                                                )
+                                                                    && op.test(lexpected, values.get(label)))) {
                             matchCount++;
                         }
                         break;
@@ -466,7 +467,9 @@ public class RunningRule {
                     break;
                 case LABELED_LONG:
                     DataTable dt = ((LabeledValueHolder) m).getValue();
-                    TraceLogMetric l = new TraceLogMetric(m.getTimeBucket(), dt.sortedValues(Comparator.naturalOrder()).toArray(new Number[0]));
+                    TraceLogMetric l = new TraceLogMetric(
+                        m.getTimeBucket(), dt.sortedValues(Comparator.naturalOrder())
+                                             .toArray(new Number[0]));
                     l.labels = dt.sortedKeys(Comparator.naturalOrder()).toArray(new String[0]);
                     r.add(l);
             }
diff --git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java
index a7229ec..a025aeb 100644
--- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java
+++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java
@@ -19,11 +19,11 @@
 package org.apache.skywalking.oap.server.core.alarm.provider;
 
 import com.google.common.collect.Lists;
+import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
-import java.util.HashMap;
 import lombok.Getter;
 import lombok.Setter;
 import org.apache.skywalking.oap.server.core.Const;
@@ -103,21 +103,16 @@ public class RunningRuleTest {
 
         runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70));
         runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71));
-        runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
 
         // check at 201808301440
         List<AlarmMessage> alarmMessages = runningRule.check();
         Assert.assertEquals(0, alarmMessages.size());
-        runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
-        // check at 201808301441
-        alarmMessages = runningRule.check();
-        Assert.assertEquals(0, alarmMessages.size());
-        runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
-        // check at 201808301442
+
+        runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
+
+        // check at 201808301440
         alarmMessages = runningRule.check();
         Assert.assertEquals(1, alarmMessages.size());
-        Assert.assertEquals("Successful rate of endpoint Service_123 is lower than 75%", alarmMessages.get(0)
-                                                                                                      .getAlarmMessage());
     }
 
     @Test
@@ -142,22 +137,18 @@ public class RunningRuleTest {
 
         runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod1, 70, 60, 40, 40, 40));
         runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod2, 60, 60, 40, 40, 40));
-        runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod3, 74, 60, 40, 40, 40));
 
         // check at 201808301440
         List<AlarmMessage> alarmMessages = runningRule.check();
         Assert.assertEquals(0, alarmMessages.size());
         runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
-        // check at 201808301441
-        alarmMessages = runningRule.check();
-        Assert.assertEquals(0, alarmMessages.size());
-        runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
-        // check at 201808301442
+
+        runningRule.in(getMetaInAlarm(123), getMultipleValueMetrics(timeInPeriod3, 74, 60, 40, 40, 40));
+
+        // check at 201808301440
         alarmMessages = runningRule.check();
         Assert.assertEquals(1, alarmMessages.size());
-        Assert.assertEquals(
-            "response percentile of endpoint Service_123 is lower than expected values", alarmMessages.get(0)
-                                                                                                      .getAlarmMessage());
+        runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
     }
 
     @Test
@@ -243,16 +234,18 @@ public class RunningRuleTest {
         long timeInPeriod3 = 201808301438L;
         runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70));
         runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 71));
-        runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
 
         // check at 201808301440
         Assert.assertEquals(0, runningRule.check().size()); //check matches, no alarm
         runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
-        // check at 201808301441
-        Assert.assertEquals(0, runningRule.check().size()); //check matches, no alarm
-        runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
+
+        runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod3, 74));
+
+        // check at 201808301440
+        Assert.assertEquals(1, runningRule.check().size()); //alarm
+        runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
+
         // check at 201808301442
-        Assert.assertNotEquals(0, runningRule.check().size()); //alarm
         Assert.assertEquals(0, runningRule.check().size()); //silence, no alarm
         Assert.assertEquals(0, runningRule.check().size()); //silence, no alarm
         Assert.assertNotEquals(0, runningRule.check().size()); //alarm
@@ -304,7 +297,8 @@ public class RunningRuleTest {
         alarmRule.setThreshold("1000");
         alarmRule.setCount(1);
         alarmRule.setPeriod(10);
-        alarmRule.setMessage("Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");
+        alarmRule.setMessage(
+            "Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");
         alarmRule.setIncludeNamesRegex("Service\\_1(\\d)+");
         alarmRule.setTags(new HashMap<String, String>() {{
             put("key", "value");
@@ -338,7 +332,8 @@ public class RunningRuleTest {
         alarmRule.setThreshold("1000");
         alarmRule.setCount(1);
         alarmRule.setPeriod(10);
-        alarmRule.setMessage("Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");
+        alarmRule.setMessage(
+            "Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");
         alarmRule.setExcludeNamesRegex("Service\\_2(\\d)+");
         alarmRule.setTags(new HashMap<String, String>() {{
             put("key", "value");
@@ -603,22 +598,16 @@ public class RunningRuleTest {
 
         runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod1, "50,17|99,11"));
         runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod2, "75,15|95,12"));
-        runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod3, "90,1|99,20"));
 
-        // check at 201808301440
         List<AlarmMessage> alarmMessages = runningRule.check();
         Assert.assertEquals(0, alarmMessages.size());
         runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
-        // check at 201808301441
-        alarmMessages = runningRule.check();
-        Assert.assertEquals(0, alarmMessages.size());
-        runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
-        // check at 201808301442
+
+        runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod3, "90,1|99,20"));
+
+        // check at 201808301440
         alarmMessages = runningRule.check();
         Assert.assertEquals(1, alarmMessages.size());
-        Assert.assertEquals(
-            "response percentile of endpoint Service_123 is lower than expected value", alarmMessages.get(0)
-                .getAlarmMessage());
-
+        runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
     }
 }