You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2021/12/09 14:35:18 UTC

[incubator-doris] branch master updated: [improvement](compaction)(tablet repair) Add missing rowsets in compaction status url and support force dropping redundant replica (#7283)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new db57c42  [improvement](compaction)(tablet repair) Add missing rowsets in compaction status url and support force dropping redundant replica (#7283)
db57c42 is described below

commit db57c42c8383a94205161a705184e157face93c9
Author: Mingyu Chen <mo...@gmail.com>
AuthorDate: Thu Dec 9 22:34:57 2021 +0800

    [improvement](compaction)(tablet repair) Add missing rowsets in compaction status url and support force dropping redundant replica (#7283)
    
    1. Add missing rowsets in compaction status url
    2. Add a new config `force_drop_redundant_replica` to force drop redundant replicas.
    3. Fix FE ut
---
 be/src/olap/tablet.cpp                              | 11 +++++++++++
 be/src/olap/tablet.h                                |  3 +++
 be/src/olap/version_graph.cpp                       |  4 ++--
 docs/.vuepress/sidebar/en.js                        |  6 +++---
 docs/.vuepress/sidebar/zh-CN.js                     |  4 ++--
 docs/en/administrator-guide/config/fe_config.md     | 11 +++++++++++
 .../http-actions/compaction-action.md               |  2 ++
 .../commit-format-specification.md                  |  2 ++
 docs/zh-CN/administrator-guide/config/fe_config.md  | 11 +++++++++++
 .../http-actions/compaction-action.md               |  2 ++
 .../commit-format-specification.md                  |  4 +++-
 .../apache/doris/catalog/TabletInvertedIndex.java   |  5 +++--
 .../org/apache/doris/clone/TabletScheduler.java     | 21 +++++++++++----------
 .../main/java/org/apache/doris/common/Config.java   | 11 +++++++++++
 .../java/org/apache/doris/mysql/MysqlChannel.java   |  4 ++--
 .../org/apache/doris/mysql/MysqlChannelTest.java    | 10 +++++-----
 16 files changed, 84 insertions(+), 27 deletions(-)

diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp
index 7a77448..5be3837 100644
--- a/be/src/olap/tablet.cpp
+++ b/be/src/olap/tablet.cpp
@@ -836,6 +836,7 @@ void Tablet::_max_continuous_version_from_beginning_unlocked(Version* version,
                   // simple because 2 versions are certainly not overlapping
                   return left.first.first < right.first.first;
               });
+
     Version max_continuous_version = {-1, 0};
     VersionHash max_continuous_version_hash = 0;
     for (int i = 0; i < existing_versions.size(); ++i) {
@@ -1108,9 +1109,17 @@ void Tablet::get_compaction_status(std::string* json_result) {
 
     // print all rowsets' version as an array
     rapidjson::Document versions_arr;
+    rapidjson::Document missing_versions_arr;
     versions_arr.SetArray();
+    missing_versions_arr.SetArray();
+    int64_t last_version = -1;
     for (int i = 0; i < rowsets.size(); ++i) {
         const Version& ver = rowsets[i]->version();
+        if (ver.first != last_version + 1) {
+            rapidjson::Value miss_value;
+            miss_value.SetString(strings::Substitute("[$0-$1]", last_version + 1, ver.first).c_str(), missing_versions_arr.GetAllocator());
+            missing_versions_arr.PushBack(miss_value, missing_versions_arr.GetAllocator());
+        }
         rapidjson::Value value;
         std::string disk_size =
                 PrettyPrinter::print(rowsets[i]->rowset_meta()->total_disk_size(), TUnit::BYTES);
@@ -1121,8 +1130,10 @@ void Tablet::get_compaction_status(std::string* json_result) {
                 rowsets[i]->rowset_id().to_string(), disk_size);
         value.SetString(version_str.c_str(), version_str.length(), versions_arr.GetAllocator());
         versions_arr.PushBack(value, versions_arr.GetAllocator());
+        last_version = ver.second;
     }
     root.AddMember("rowsets", versions_arr, root.GetAllocator());
+    root.AddMember("missing_rowsets", missing_versions_arr, root.GetAllocator());
 
     // print all stale rowsets' version as an array
     rapidjson::Document stale_versions_arr;
diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h
index d9c786a..42e703a 100644
--- a/be/src/olap/tablet.h
+++ b/be/src/olap/tablet.h
@@ -266,6 +266,9 @@ private:
     void _print_missed_versions(const std::vector<Version>& missed_versions) const;
     bool _contains_rowset(const RowsetId rowset_id);
     OLAPStatus _contains_version(const Version& version);
+
+    // Returns:
+    // version: the max continuous version from beginning
     void _max_continuous_version_from_beginning_unlocked(Version* version,
                                                          VersionHash* v_hash) const;
     RowsetSharedPtr _rowset_with_largest_size();
diff --git a/be/src/olap/version_graph.cpp b/be/src/olap/version_graph.cpp
index 336f4fb..6e94809 100644
--- a/be/src/olap/version_graph.cpp
+++ b/be/src/olap/version_graph.cpp
@@ -606,8 +606,8 @@ OLAPStatus VersionGraph::capture_consistent_versions(const Version& spec_version
             }
             cur_idx = next_idx;
         } else {
-            LOG(WARNING) << "fail to find path in version_graph. "
-                         << "spec_version: " << spec_version.first << "-" << spec_version.second;
+            VLOG_NOTICE << "fail to find path in version_graph. "
+                        << "spec_version: " << spec_version.first << "-" << spec_version.second;
             return OLAP_ERR_VERSION_NOT_EXIST;
         }
     }
diff --git a/docs/.vuepress/sidebar/en.js b/docs/.vuepress/sidebar/en.js
index caf85b9..1f00708 100644
--- a/docs/.vuepress/sidebar/en.js
+++ b/docs/.vuepress/sidebar/en.js
@@ -625,12 +625,11 @@ module.exports = [
         "benchmark-tool",
         "fe-eclipse-dev",
         "fe-idea-dev",
-        "be-vscode-dev",		
+        "be-vscode-dev",        
         "java-format-code",
         "cpp-format-code",
         "How-to-Share-blogs",
-	"commit-format-specification",
-		"minidump"
+        "minidump"
     ],
   },
   {
@@ -650,6 +649,7 @@ module.exports = [
       "feedback",
       "how-to-contribute",
       "committer-guide",
+      "commit-format-specification",
       "pull-request",
       "release-process",
       "verify-apache-release",
diff --git a/docs/.vuepress/sidebar/zh-CN.js b/docs/.vuepress/sidebar/zh-CN.js
index df2df5c..9e11e7d 100644
--- a/docs/.vuepress/sidebar/zh-CN.js
+++ b/docs/.vuepress/sidebar/zh-CN.js
@@ -633,8 +633,7 @@ module.exports = [
         "java-format-code",
         "cpp-format-code",
         "How-to-Share-blogs",
-	"commit-format-specification",
-		"minidump"
+        "minidump"
     ],
   },
   {
@@ -654,6 +653,7 @@ module.exports = [
       "feedback",
       "how-to-contribute",
       "committer-guide",
+      "commit-format-specification",
       "pull-request",
       "release-process",
       "verify-apache-release",
diff --git a/docs/en/administrator-guide/config/fe_config.md b/docs/en/administrator-guide/config/fe_config.md
index 1135f4c..17b5b2f 100644
--- a/docs/en/administrator-guide/config/fe_config.md
+++ b/docs/en/administrator-guide/config/fe_config.md
@@ -2082,3 +2082,14 @@ Default: true
 
 Whether to use compressed format to send query plan structure. After it is turned on, the size of the query plan structure can be reduced by about 50%, thereby avoiding some "send fragment timeout" errors.
 However, in some high-concurrency small query scenarios, the concurrency may be reduced by about 10%.
+
+### enable_force_drop_redundant_replica
+
+Default: false
+
+Dynamically configured: true
+
+Only for Master FE: true
+
+If set to true, the system will immediately drop redundant replicas in the tablet scheduling logic. This may cause some load jobs that are writing to the corresponding replicas to fail, but it will speed up the balance and repair of tablets.
+When there are a large number of replicas waiting to be balanced or repaired in the cluster, you can try setting this config to true to speed up the balance and repair of replicas at the expense of some load jobs failing.
diff --git a/docs/en/administrator-guide/http-actions/compaction-action.md b/docs/en/administrator-guide/http-actions/compaction-action.md
index 248c98b..6177151 100644
--- a/docs/en/administrator-guide/http-actions/compaction-action.md
+++ b/docs/en/administrator-guide/http-actions/compaction-action.md
@@ -84,6 +84,7 @@ If the tablet exists, the result is returned in JSON format:
         "[50-50] 0 DELETE NONOVERLAPPING 574.00 B",
         "[51-51] 5 DATA OVERLAPPING 574.00 B"
     ],
+    "missing_rowsets": [],
     "stale version path": [
         {
             "path id": "2",
@@ -106,6 +107,7 @@ Explanation of results:
 * last cumulative failure time: The time when the last cumulative compaction failed. After 10 minutes by default, cumulative compaction is attempted on the this tablet again.
 * last base failure time: The time when the last base compaction failed. After 10 minutes by default, base compaction is attempted on the this tablet again.
 * rowsets: The current rowsets collection of this tablet. [0-48] means a rowset with version 0-48. The second number is the number of segments in a rowset. The `DELETE` indicates the delete version. `OVERLAPPING` and `NONOVERLAPPING` indicates whether data between segments is overlap.
+* missing_rowsets: The missing rowsets. An empty array means no version gap was found between the rowsets.
 * stale version path: The merged version path of the rowset collection currently merged in the tablet. It is an array structure and each element represents a merged path. Each element contains three attributes: path id indicates the version path id, and last create time indicates the creation time of the most recent rowset on the path. By default, all rowsets on this path will be deleted after half an hour at the last create time.
 
 ### Examples
diff --git a/docs/en/developer-guide/commit-format-specification.md b/docs/en/community/commit-format-specification.md
similarity index 97%
rename from docs/en/developer-guide/commit-format-specification.md
rename to docs/en/community/commit-format-specification.md
index c41ea0a..05b21a9 100644
--- a/docs/en/developer-guide/commit-format-specification.md
+++ b/docs/en/community/commit-format-specification.md
@@ -41,6 +41,7 @@ Commit is divided into ‘ title ’ and ‘ content ’ , the title should be l
         
         * fix: Bug fix
         * feature: New feature
+        * feature-wip: Feature work-in-progress.
         * improvement: Optimization and improvement for the original feature. 
         * docs: Documents
         * style: Code style adjustment
@@ -51,6 +52,7 @@ Commit is divided into ‘ title ’ and ‘ content ’ , the title should be l
         * chore: Modification of build tool
         * revert: Revert a previous commit
         * deps: Modification of third-party dependency Library
+        * community: Such as modification of Github issue template.
 
         Some tips:
         
diff --git a/docs/zh-CN/administrator-guide/config/fe_config.md b/docs/zh-CN/administrator-guide/config/fe_config.md
index 706bb45..4a12591 100644
--- a/docs/zh-CN/administrator-guide/config/fe_config.md
+++ b/docs/zh-CN/administrator-guide/config/fe_config.md
@@ -2102,3 +2102,14 @@ load 标签清理器将每隔 `label_clean_interval_second` 运行一次以清
 是否为 Master FE 节点独有的配置项:false
 
 如果设置为true,将关闭副本修复和均衡逻辑。
+
+### enable_force_drop_redundant_replica
+
+默认值:false
+
+是否可以动态配置:true
+
+是否为 Master FE 节点独有的配置项:true
+
+如果设置为true,系统会在副本调度逻辑中,立即删除冗余副本。这可能导致部分正在对对应副本写入的导入作业失败,但是会加速副本的均衡和修复速度。
+当集群中有大量等待被均衡或修复的副本时,可以尝试设置此参数,以牺牲部分导入成功率为代价,加速副本的均衡和修复。
diff --git a/docs/zh-CN/administrator-guide/http-actions/compaction-action.md b/docs/zh-CN/administrator-guide/http-actions/compaction-action.md
index aa58c8c..f04cf1b 100644
--- a/docs/zh-CN/administrator-guide/http-actions/compaction-action.md
+++ b/docs/zh-CN/administrator-guide/http-actions/compaction-action.md
@@ -84,6 +84,7 @@ curl -X GET http://be_host:webserver_port/api/compaction/show?tablet_id=xxxx\&sc
         "[50-50] 0 DELETE NONOVERLAPPING 574.00 B",
         "[51-51] 5 DATA OVERLAPPING 574.00 B"
     ],
+    "missing_rowsets": [],
     "stale version path": [
         {
             "path id": "2",
@@ -106,6 +107,7 @@ curl -X GET http://be_host:webserver_port/api/compaction/show?tablet_id=xxxx\&sc
 * last cumulative failure time:上一次尝试 cumulative compaction 失败的时间。默认 10min 后才会再次尝试对该 tablet 做 cumulative compaction。
 * last base failure time:上一次尝试 base compaction 失败的时间。默认 10min 后才会再次尝试对该 tablet 做 base compaction。
 * rowsets:该 tablet 当前的 rowset 集合。如 [0-48] 表示 0-48 版本。第二位数字表示该版本中 segment 的数量。`DELETE` 表示 delete 版本。`DATA` 表示数据版本。`OVERLAPPING` 和 `NONOVERLAPPING` 表示segment数据是否重叠。
+* missing_rowsets: 缺失的版本。
 * stale version path:该 table 当前被合并rowset集合的合并版本路径,该结构是一个数组结构,每个元素表示一个合并路径。每个元素中包含了三个属性:path id 表示版本路径id,last create time 表示当前路径上最近的 rowset 创建时间,默认在这个时间半个小时之后这条路径上的所有 rowset 会被过期删除。
 
 ### 示例
diff --git a/docs/zh-CN/developer-guide/commit-format-specification.md b/docs/zh-CN/community/commit-format-specification.md
similarity index 96%
rename from docs/zh-CN/developer-guide/commit-format-specification.md
rename to docs/zh-CN/community/commit-format-specification.md
index 5953067..224dedb 100644
--- a/docs/zh-CN/developer-guide/commit-format-specification.md
+++ b/docs/zh-CN/community/commit-format-specification.md
@@ -39,6 +39,7 @@ Commit 分为“标题”和“内容”。原则上标题全部小写。内容
         
         * fix:bug修复
         * feature:新增功能
+        * feature-wip:开发中的功能,比如某功能的部分代码。
         * improvement:原有功能的优化和改进
         * docs:文档
         * style:代码风格调整
@@ -49,6 +50,7 @@ Commit 分为“标题”和“内容”。原则上标题全部小写。内容
         * chore:构建工具的修改
         * revert:回滚
         * deps:第三方依赖库的修改
+        * community:社区相关的修改,如修改 Github Issue 模板等。
 
         几点说明:
         
@@ -155,4 +157,4 @@ Commit 分为“标题”和“内容”。原则上标题全部小写。内容
     
     In my test, loading a table with 48 buckets, mem limit 2G, in previous version,
     the average memtable size is 44MB, after modification, the average size is 82MB
-    ```
\ No newline at end of file
+    ```
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/TabletInvertedIndex.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/TabletInvertedIndex.java
index 4bf3b0f..1942894 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/TabletInvertedIndex.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/TabletInvertedIndex.java
@@ -270,9 +270,10 @@ public class TabletInvertedIndex {
         long end = System.currentTimeMillis();
         LOG.info("finished to do tablet diff with backend[{}]. sync: {}. metaDel: {}. foundValid: {}. foundInvalid: {}."
                         + " migration: {}. found invalid transactions {}. found republish transactions {}. tabletInMemorySync: {}."
-                        + " cost: {} ms", backendId, tabletSyncMap.size(),
+                        + " need recovery: {}. cost: {} ms", backendId, tabletSyncMap.size(),
                 tabletDeleteFromMeta.size(), foundTabletsWithValidSchema.size(), foundTabletsWithInvalidSchema.size(),
-                tabletMigrationMap.size(), transactionsToClear.size(), transactionsToPublish.size(), tabletToInMemory.size(), (end - start));
+                tabletMigrationMap.size(), transactionsToClear.size(), transactionsToPublish.size(), tabletToInMemory.size(),
+                tabletRecoveryMap.size(), (end - start));
     }
 
     public Long getTabletIdByReplica(long replicaId) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
index 2fcc821..f66e42d 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
@@ -181,7 +181,7 @@ public class TabletScheduler extends MasterDaemon {
         for (Long beId : backendsWorkingSlots.keySet()) {
             if (backends.containsKey(beId)) {
                 List<Long> pathHashes = backends.get(beId).getDisks().values().stream()
-                        .filter(v -> v.getState()==DiskState.ONLINE)
+                        .filter(v -> v.getState() == DiskState.ONLINE)
                         .map(DiskInfo::getPathHash).collect(Collectors.toList());
                 backendsWorkingSlots.get(beId).updatePaths(pathHashes);
             } else {
@@ -529,7 +529,7 @@ public class TabletScheduler extends MasterDaemon {
                 // executing an alter job, but the alter job is in a PENDING state and is waiting for
                 // the table to become stable. In this case, we allow the tablet repair to proceed.
                 throw new SchedException(Status.UNRECOVERABLE,
-                    "table is in alter process, but tablet status is " + statusPair.first.name());
+                        "table is in alter process, but tablet status is " + statusPair.first.name());
             }
 
             tabletCtx.setTabletStatus(statusPair.first);
@@ -912,7 +912,7 @@ public class TabletScheduler extends MasterDaemon {
     }
 
     private boolean deleteFromHighLoadBackend(TabletSchedCtx tabletCtx, List<Replica> replicas,
-            boolean force, ClusterLoadStatistic statistic) throws SchedException {
+                                              boolean force, ClusterLoadStatistic statistic) throws SchedException {
         Replica chosenReplica = null;
         double maxScore = 0;
         for (Replica replica : replicas) {
@@ -976,7 +976,8 @@ public class TabletScheduler extends MasterDaemon {
          * 2. Wait for any txns before the watermark txn id to be finished. If all are finished, which means this replica is
          *      safe to be deleted.
          */
-        if (!force && replica.getState().canLoad() && replica.getWatermarkTxnId() == -1 && !FeConstants.runningUnitTest) {
+        if (!force && !Config.enable_force_drop_redundant_replica && replica.getState().canLoad()
+                && replica.getWatermarkTxnId() == -1 && !FeConstants.runningUnitTest) {
             long nextTxnId = Catalog.getCurrentGlobalTransactionMgr().getTransactionIDGenerator().getNextTransactionId();
             replica.setWatermarkTxnId(nextTxnId);
             replica.setState(ReplicaState.DECOMMISSION);
@@ -1011,11 +1012,11 @@ public class TabletScheduler extends MasterDaemon {
 
         // write edit log
         ReplicaPersistInfo info = ReplicaPersistInfo.createForDelete(tabletCtx.getDbId(),
-                                                                     tabletCtx.getTblId(),
-                                                                     tabletCtx.getPartitionId(),
-                                                                     tabletCtx.getIndexId(),
-                                                                     tabletCtx.getTabletId(),
-                                                                     replica.getBackendId());
+                tabletCtx.getTblId(),
+                tabletCtx.getPartitionId(),
+                tabletCtx.getIndexId(),
+                tabletCtx.getTabletId(),
+                replica.getBackendId());
 
         Catalog.getCurrentCatalog().getEditLog().logDeleteReplica(info);
 
@@ -1103,7 +1104,7 @@ public class TabletScheduler extends MasterDaemon {
             addTablet(tabletCtx, false);
         }
     }
- 
+
     /**
      * Try to create a balance task for a tablet.
      */
diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/Config.java b/fe/fe-core/src/main/java/org/apache/doris/common/Config.java
index fe13d9b..ae4fedf 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/common/Config.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/common/Config.java
@@ -1544,4 +1544,15 @@ public class Config extends ConfigBase {
      */
     @ConfField(mutable = true, masterOnly = true)
     public static boolean disable_tablet_scheduler = false;
+
+    /*
+     * When doing clone or repair tablet task, there may be replicas in REDUNDANT state, which
+     * should be dropped later. But there may be loading tasks on these replicas, so the default strategy
+     * is to wait until the loading tasks finish before dropping them.
+     * But the default strategy may take a very long time to handle these redundant replicas.
+     * So we can set this config to true to not wait for any loading task.
+     * Setting this config to true may cause loading tasks to fail, but will speed up the process of tablet balance and repair.
+     */
+    @ConfField(mutable = true, masterOnly = true)
+    public static boolean enable_force_drop_redundant_replica = false;
 }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/mysql/MysqlChannel.java b/fe/fe-core/src/main/java/org/apache/doris/mysql/MysqlChannel.java
index f9d00e1..df7f1d8 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/mysql/MysqlChannel.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/mysql/MysqlChannel.java
@@ -33,7 +33,7 @@ import java.nio.channels.SocketChannel;
 public class MysqlChannel {
     // max length which one MySQL physical can hold, if one logical packet is bigger than this,
     // one packet will split to many packets
-    protected static final int MAX_PHYSICAL_PACKET_LENGTH = 0xffffff;
+    public static final int MAX_PHYSICAL_PACKET_LENGTH = 0xffffff;
     // MySQL packet header length
     protected static final int PACKET_HEADER_LEN = 4;
     // logger for this class
@@ -60,7 +60,7 @@ public class MysqlChannel {
         this.remoteHostPortString = "";
         this.remoteIp = "";
     }
-    
+
     public MysqlChannel(SocketChannel channel) {
         this.sequenceId = 0;
         this.channel = channel;
diff --git a/fe/fe-core/src/test/java/org/apache/doris/mysql/MysqlChannelTest.java b/fe/fe-core/src/test/java/org/apache/doris/mysql/MysqlChannelTest.java
index a2a05a6..017f15d 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/mysql/MysqlChannelTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/mysql/MysqlChannelTest.java
@@ -17,10 +17,6 @@
 
 package org.apache.doris.mysql;
 
-import mockit.Delegate;
-import mockit.Expectations;
-import mockit.Mocked;
-
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
@@ -30,6 +26,10 @@ import java.net.InetSocketAddress;
 import java.nio.ByteBuffer;
 import java.nio.channels.SocketChannel;
 
+import mockit.Delegate;
+import mockit.Expectations;
+import mockit.Mocked;
+
 public class MysqlChannelTest {
     int packetId = 0;
     int readIdx = 0;
@@ -297,4 +297,4 @@ public class MysqlChannelTest {
         Assert.fail("No Exception throws.");
     }
 
-}
\ No newline at end of file
+}

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org