You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2022/10/22 09:53:13 UTC

[doris] branch master updated: [improvement](heartbeat) Add some relaxation strategies to reduce the failure probability of regression testing (#13568)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 413d2332ce [improvement](heartbeat) Add some relaxation strategies to reduce the failure probability of regression testing (#13568)
413d2332ce is described below

commit 413d2332ce5a8bba423fd14ff6d56724ee6da385
Author: Mingyu Chen <mo...@gmail.com>
AuthorDate: Sat Oct 22 17:53:07 2022 +0800

    [improvement](heartbeat) Add some relaxation strategies to reduce the failure probability of regression testing (#13568)
    
    The regression tests may occasionally fail because of heartbeat failures.
    So I added 2 new FE configs to relax this limit:
    
    1. `disable_backend_black_list`
        Set to true so that a Backend is not added to the black list even if we fail to send a task to it. Default is false.
    2. `max_backend_heartbeat_failure_tolerance_count`
       Only when the number of consecutive heartbeat failures reaches this config is the Backend marked as dead. Default is 1.
---
 docs/en/docs/admin-manual/config/fe-config.md      | 22 ++++++++++++++++++
 docs/zh-CN/docs/admin-manual/config/fe-config.md   | 22 ++++++++++++++++++
 .../main/java/org/apache/doris/common/Config.java  | 17 ++++++++++++++
 .../apache/doris/common/proc/BackendsProcDir.java  |  3 +++
 .../main/java/org/apache/doris/qe/Coordinator.java |  2 +-
 .../java/org/apache/doris/qe/SimpleScheduler.java  |  5 ++++-
 .../main/java/org/apache/doris/system/Backend.java | 26 +++++++++++++++++++---
 .../doris/utframe/DemoMultiBackendsTest.java       |  5 +++--
 8 files changed, 95 insertions(+), 7 deletions(-)

diff --git a/docs/en/docs/admin-manual/config/fe-config.md b/docs/en/docs/admin-manual/config/fe-config.md
index 674437e670..a4de6ececb 100644
--- a/docs/en/docs/admin-manual/config/fe-config.md
+++ b/docs/en/docs/admin-manual/config/fe-config.md
@@ -2242,3 +2242,25 @@ Default: 100
 Is it possible to dynamically configure: true
 
 Is it a configuration item unique to the Master FE node: false
+
+### `disable_backend_black_list`
+
+Used to disable the BE blacklist function. After this function is disabled, if the query request to the BE fails, the BE will not be added to the blacklist.
+This parameter is intended for regression testing environments, to prevent occasional errors from causing a large number of regression tests to fail.
+
+Default: false
+
+Is it possible to configure dynamically: true
+
+Is it a configuration item unique to the Master FE node: false
+
+### `max_backend_heartbeat_failure_tolerance_count`
+
+The maximum tolerable number of BE node heartbeat failures. If the number of consecutive heartbeat failures reaches this value, the BE state will be set to dead.
+This parameter is intended for regression test environments, to prevent occasional heartbeat failures from causing a large number of regression tests to fail.
+
+Default: 1
+
+Is it possible to configure dynamically: true
+
+Is it a configuration item unique to the Master FE node: true
diff --git a/docs/zh-CN/docs/admin-manual/config/fe-config.md b/docs/zh-CN/docs/admin-manual/config/fe-config.md
index bc00e0d649..6608b58845 100644
--- a/docs/zh-CN/docs/admin-manual/config/fe-config.md
+++ b/docs/zh-CN/docs/admin-manual/config/fe-config.md
@@ -2297,3 +2297,25 @@ load 标签清理器将每隔 `label_clean_interval_second` 运行一次以清
 是否可以动态配置:true
 
 是否为 Master FE 节点独有的配置项:false
+
+### `disable_backend_black_list`
+
+用于禁止BE黑名单功能。禁止该功能后,如果向BE发送查询请求失败,也不会将这个BE添加到黑名单。
+该参数适用于回归测试环境,以减少偶发的错误导致大量回归测试失败。
+
+默认值:false
+
+是否可以动态配置:true
+
+是否为 Master FE 节点独有的配置项:false
+
+### `max_backend_heartbeat_failure_tolerance_count`
+
+最大可容忍的BE节点心跳失败次数。如果连续心跳失败次数达到这个值,则会将BE状态置为 dead。
+该参数适用于回归测试环境,以减少偶发的心跳失败导致大量回归测试失败。
+
+默认值:1
+
+是否可以动态配置:true
+
+是否为 Master FE 节点独有的配置项:true
diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/Config.java b/fe/fe-core/src/main/java/org/apache/doris/common/Config.java
index 871fd0470f..de1d8015ec 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/common/Config.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/common/Config.java
@@ -1794,4 +1794,21 @@ public class Config extends ConfigBase {
     @ConfField(mutable = true, masterOnly = false)
     public static int max_query_profile_num = 100;
 
+    /**
+     * Set to true to disable backend black list, so that even if we failed to send task to a backend,
+     * that backend won't be added to black list.
+     * This should only be set when running tests, such as regression test.
+     * It is highly recommended NOT to set this to true in production environments.
+     */
+    @ConfField(mutable = true, masterOnly = false)
+    public static boolean disable_backend_black_list = false;
+
+    /**
+     * Maximum backend heartbeat failure tolerance count.
+     * Default is 1, which means the backend will be marked as dead after a single heartbeat failure.
+     * A larger value can improve the tolerance of the cluster to occasional heartbeat failures.
+     * For example, when running regression tests, this value can be increased.
+     */
+    @ConfField(mutable = true, masterOnly = true)
+    public static long max_backend_heartbeat_failure_tolerance_count = 1;
 }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/proc/BackendsProcDir.java b/fe/fe-core/src/main/java/org/apache/doris/common/proc/BackendsProcDir.java
index aa3c9821b0..096cf57c0b 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/common/proc/BackendsProcDir.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/common/proc/BackendsProcDir.java
@@ -52,6 +52,7 @@ public class BackendsProcDir implements ProcDirInterface {
             .add("SystemDecommissioned").add("ClusterDecommissioned").add("TabletNum")
             .add("DataUsedCapacity").add("AvailCapacity").add("TotalCapacity").add("UsedPct")
             .add("MaxDiskUsedPct").add("RemoteUsedCapacity").add("Tag").add("ErrMsg").add("Version").add("Status")
+            .add("HeartbeatFailureCounter")
             .build();
 
     public static final int HOSTNAME_INDEX = 3;
@@ -178,6 +179,8 @@ public class BackendsProcDir implements ProcDirInterface {
             backendInfo.add(backend.getVersion());
             // status
             backendInfo.add(new Gson().toJson(backend.getBackendStatus()));
+            // heartbeat failure counter
+            backendInfo.add(backend.getHeartbeatFailureCounter());
 
             comparableBackendInfos.add(backendInfo);
         }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java
index cd3296c354..13809d5633 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java
@@ -2150,7 +2150,7 @@ public class Coordinator {
         }
 
         public boolean isBackendStateHealthy() {
-            if (backend.getLastMissingHeartbeatTime() > lastMissingHeartbeatTime) {
+            if (backend.getLastMissingHeartbeatTime() > lastMissingHeartbeatTime && !backend.isAlive()) {
                 LOG.warn("backend {} is down while joining the coordinator. job id: {}",
                         backend.getId(), jobId);
                 return false;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SimpleScheduler.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SimpleScheduler.java
index 1b1cc6bfa4..69f4408848 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SimpleScheduler.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SimpleScheduler.java
@@ -18,6 +18,7 @@
 package org.apache.doris.qe;
 
 import org.apache.doris.catalog.Env;
+import org.apache.doris.common.Config;
 import org.apache.doris.common.FeConstants;
 import org.apache.doris.common.Pair;
 import org.apache.doris.common.Reference;
@@ -169,7 +170,9 @@ public class SimpleScheduler {
     }
 
     public static void addToBlacklist(Long backendID, String reason) {
-        if (backendID == null) {
+        if (backendID == null || Config.disable_backend_black_list) {
+            LOG.warn("ignore backend black list for backend: {}, disabled: {}", backendID,
+                    Config.disable_backend_black_list);
             return;
         }
 
diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java b/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java
index fc5758374c..f705f2a2b1 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java
@@ -21,6 +21,7 @@ import org.apache.doris.alter.DecommissionType;
 import org.apache.doris.catalog.DiskInfo;
 import org.apache.doris.catalog.DiskInfo.DiskState;
 import org.apache.doris.catalog.Env;
+import org.apache.doris.common.Config;
 import org.apache.doris.common.FeConstants;
 import org.apache.doris.common.io.Text;
 import org.apache.doris.common.io.Writable;
@@ -128,6 +129,14 @@ public class Backend implements Writable {
     @SerializedName("tagMap")
     private Map<String, String> tagMap = Maps.newHashMap();
 
+    // Counter of consecutive heartbeat failures.
+    // Each failed heartbeat increases this counter by one.
+    // Once it reaches Config.max_backend_heartbeat_failure_tolerance_count, this backend
+    // will be marked as dead.
+    // A successful heartbeat resets this counter to zero.
+    // No need to persist, because only the master FE handles heartbeats.
+    private int heartbeatFailureCounter = 0;
+
     public Backend() {
         this.host = "";
         this.version = "";
@@ -333,6 +342,10 @@ public class Backend implements Writable {
         return backendStatus;
     }
 
+    public int getHeartbeatFailureCounter() {
+        return heartbeatFailureCounter;
+    }
+
     /**
      * backend belong to some cluster
      *
@@ -690,12 +703,19 @@ public class Backend implements Writable {
             }
 
             heartbeatErrMsg = "";
+            this.heartbeatFailureCounter = 0;
         } else {
-            if (isAlive.compareAndSet(true, false)) {
-                isChanged = true;
-                LOG.warn("{} is dead,", this.toString());
+            // Only set backend to dead once the heartbeat failure counter reaches the threshold.
+            if (++this.heartbeatFailureCounter >= Config.max_backend_heartbeat_failure_tolerance_count) {
+                if (isAlive.compareAndSet(true, false)) {
+                    isChanged = true;
+                    LOG.warn("{} is dead,", this.toString());
+                }
             }
 
+            // Still set the error msg and missing time even if we may not mark this backend as dead,
+            // to make debugging easier.
+            // But note that if isChanged = false, these values will not be synced to other FEs.
             heartbeatErrMsg = hbResponse.getMsg() == null ? "Unknown error" : hbResponse.getMsg();
             lastMissingHeartbeatTime = System.currentTimeMillis();
         }
diff --git a/fe/fe-core/src/test/java/org/apache/doris/utframe/DemoMultiBackendsTest.java b/fe/fe-core/src/test/java/org/apache/doris/utframe/DemoMultiBackendsTest.java
index 7f619f8267..860abe35e2 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/utframe/DemoMultiBackendsTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/utframe/DemoMultiBackendsTest.java
@@ -199,8 +199,9 @@ public class DemoMultiBackendsTest {
         ProcResult result = dir.fetchResult();
         Assert.assertEquals(BackendsProcDir.TITLE_NAMES.size(), result.getColumnNames().size());
         Assert.assertEquals("{\"location\" : \"default\"}", result.getRows().get(0).get(20));
-        Assert.assertEquals("{\"lastSuccessReportTabletsTime\":\"N/A\",\"lastStreamLoadTime\":-1,\"isQueryDisabled\":false,\"isLoadDisabled\":false}",
-                result.getRows().get(0).get(BackendsProcDir.TITLE_NAMES.size() - 1));
+        Assert.assertEquals(
+                "{\"lastSuccessReportTabletsTime\":\"N/A\",\"lastStreamLoadTime\":-1,\"isQueryDisabled\":false,\"isLoadDisabled\":false}",
+                result.getRows().get(0).get(BackendsProcDir.TITLE_NAMES.size() - 2));
     }
 
     private static void updateReplicaPathHash() {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org