You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by ya...@apache.org on 2021/09/29 03:11:41 UTC

[incubator-doris] branch master updated: [Bug] Fix bdbje getDatabaseNames() bug and scan node close bug (#6769)

This is an automated email from the ASF dual-hosted git repository.

yangzhg pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new ad3c939  [Bug] Fix bdbje getDatabaseNames() bug and scan node close bug (#6769)
ad3c939 is described below

commit ad3c9390a26c9f5792f6992d4a0f1123c8d8594e
Author: Mingyu Chen <mo...@gmail.com>
AuthorDate: Wed Sep 29 11:11:28 2021 +0800

    [Bug] Fix bdbje getDatabaseNames() bug and scan node close bug (#6769)
    
    1. This bug is introduced from #6582
    2. Optimize the error log for the "address already in use" error.
    3. Add some documentation about compilation.
        1. Add a custom thirdparty download url.
        2. Add a custom com.alibaba maven jar package for DataX.
    4. Fix bug that BE crash when closing scan node, introduced from #6622.
---
 be/src/exec/olap_scan_node.cpp                     |   7 +-
 .../routine_load/routine_load_task_executor.cpp    |   4 +
 docs/en/extending-doris/datax.md                   |  16 ++-
 docs/en/installing/compilation.md                  |  30 ++++--
 docs/zh-CN/extending-doris/datax.md                |  16 ++-
 docs/zh-CN/installing/compilation.md               |  18 ++++
 .../src/main/java/org/apache/doris/PaloFe.java     |  31 +++++-
 .../java/org/apache/doris/catalog/Catalog.java     |   5 +-
 .../org/apache/doris/common/util/NetUtils.java     |  41 +++++++-
 .../apache/doris/journal/bdbje/BDBJEJournal.java   | 111 ++++++++++-----------
 .../java/org/apache/doris/master/Checkpoint.java   |  10 +-
 11 files changed, 207 insertions(+), 82 deletions(-)

diff --git a/be/src/exec/olap_scan_node.cpp b/be/src/exec/olap_scan_node.cpp
index 6378fa2..2871b29 100644
--- a/be/src/exec/olap_scan_node.cpp
+++ b/be/src/exec/olap_scan_node.cpp
@@ -372,8 +372,11 @@ Status OlapScanNode::close(RuntimeState* state) {
     _row_batch_added_cv.notify_all();
     _scan_batch_added_cv.notify_all();
 
-    // join transfer thread
-    _transfer_thread->join();
+	// join transfer thread
+	// _transfer_thread may not be initialized, so check it before joining
+	if (_transfer_thread != nullptr) {
+		_transfer_thread->join();
+	}
 
     // clear some row batch in queue
     for (auto row_batch : _materialized_row_batches) {
diff --git a/be/src/runtime/routine_load/routine_load_task_executor.cpp b/be/src/runtime/routine_load/routine_load_task_executor.cpp
index 17d13a4..0e7ae45 100644
--- a/be/src/runtime/routine_load/routine_load_task_executor.cpp
+++ b/be/src/runtime/routine_load/routine_load_task_executor.cpp
@@ -351,6 +351,7 @@ void RoutineLoadTaskExecutor::err_handler(StreamLoadContext* ctx, const Status&
 // for test only
 Status RoutineLoadTaskExecutor::_execute_plan_for_test(StreamLoadContext* ctx) {
     auto mock_consumer = [this, ctx]() {
+		ctx->ref();
         std::shared_ptr<StreamLoadPipe> pipe = _exec_env->load_stream_mgr()->get(ctx->id);
         bool eof = false;
         std::stringstream ss;
@@ -378,6 +379,9 @@ Status RoutineLoadTaskExecutor::_execute_plan_for_test(StreamLoadContext* ctx) {
                 ss << one;
             }
         }
+        if (ctx->unref()) {
+            delete ctx;
+        }
     };
 
     std::thread t1(mock_consumer);
diff --git a/docs/en/extending-doris/datax.md b/docs/en/extending-doris/datax.md
index 838e309..1776f87 100644
--- a/docs/en/extending-doris/datax.md
+++ b/docs/en/extending-doris/datax.md
@@ -83,8 +83,22 @@ Because the doriswriter plug-in depends on some modules in the DataX code base,
 
         > hdfsreader, hdfswriter and oscarwriter needs some extra jar packages. If you don't need to use these components, you can comment out the corresponding module in DataX/pom.xml.
 
+	3. Compilation error
+
+        If you encounter the following compilation errors:
+
+        ```
+        Could not find artifact com.alibaba.datax:datax-all:pom:0.0.1-SNAPSHOT ...
+        ```
+
+        You can try the following solutions:
+
+        1. Download [alibaba-datax-maven-m2-20210928.tar.gz](https://doris-thirdparty-repo.bj.bcebos.com/thirdparty/alibaba-datax-maven-m2-20210928.tar.gz)
+        2. After decompression, copy the resulting `alibaba/datax/` directory into the `.m2/repository/com/alibaba/` directory of the Maven installation you are using.
+        3. Try to compile again.
+
 4. Commit code of doriswriter in `doriswriter` if you need.
 
 ### Example
 
-For instructions on using the doriswriter plug-in, please refer to [here](https://github.com/apache/incubator-doris/blob/master/extension/DataX/doriswriter/doc/doriswriter.md).
\ No newline at end of file
+For instructions on using the doriswriter plug-in, please refer to [here](https://github.com/apache/incubator-doris/blob/master/extension/DataX/doriswriter/doc/doriswriter.md).
diff --git a/docs/en/installing/compilation.md b/docs/en/installing/compilation.md
index ecac70d..3609452 100644
--- a/docs/en/installing/compilation.md
+++ b/docs/en/installing/compilation.md
@@ -35,9 +35,9 @@ This document focuses on how to code Doris through source code.
 
 1. Download Docker Mirror
 
-	`$ docker pull apache/incubator-doris:build-env-1.3.1`
+    `$ docker pull apache/incubator-doris:build-env-1.3.1`
 
-	Check mirror download completed:
+    Check mirror download completed:
 
     ```
     $ docker images
@@ -78,7 +78,7 @@ Note: For different versions of Doris, you need to download the corresponding mi
 
 2. Running Mirror
 
-	`$ docker run -it apache/incubator-doris:build-env-1.3.1`
+    `$ docker run -it apache/incubator-doris:build-env-1.3.1`
 
     It is recommended to run the container by mounting the local Doris source directory, so that the compiled binary file will be stored in the host machine and will not disappear because the container exits.
 
@@ -90,7 +90,7 @@ Note: For different versions of Doris, you need to download the corresponding mi
 
 3. Download source code
 
-	After starting the mirror, you should be in the container. The Doris source code can be downloaded from the following command (local source directory mounted is not required):
+    After starting the mirror, you should be in the container. The Doris source code can be downloaded from the following command (local source directory mounted is not required):
 
     ```
     $ wget https://dist.apache.org/repos/dist/dev/incubator/doris/xxx.tar.gz
@@ -104,7 +104,7 @@ Note: For different versions of Doris, you need to download the corresponding mi
     $ sh build.sh
     ```
 
-	After compilation, the output file is in the `output/` directory.
+    After compilation, the output file is in the `output/` directory.
 
 ### Self-compiling Development Environment Mirror
 
@@ -166,13 +166,31 @@ You can try to compile Doris directly in your own Linux environment.
     ```
     $ sh build.sh
     ```
-	After compilation, the output file is in the `output/` directory.
+    After compilation, the output file is in the `output/` directory.
 
 ## FAQ
 
 1. `Could not transfer artifact net.sourceforge.czt.dev:cup-maven-plugin:pom:1.6-cdh from/to xxx`
 
     If you encounter the above error, please refer to [PR #4769](https://github.com/apache/incubator-doris/pull/4769/files) to modify the cloudera-related repo configuration in `fe/pom.xml`.
+
+2. Third-party dependency download links are wrong or no longer valid
+
+     The download links of the third-party libraries that Doris relies on are all in the `thirdparty/vars.sh` file. Over time, some download links may become invalid. If you encounter this situation, it can be solved in either of the following two ways:
+
+     1. Manually modify the `thirdparty/vars.sh` file
+
+         Manually modify the problematic download link and the corresponding MD5 value.
+
+     2. Use a third-party download repository:
+
+         ```
+         export REPOSITORY_URL=https://doris-thirdparty-repo.bj.bcebos.com/thirdparty
+         sh build-thirdparty.sh
+         ```
+
+         REPOSITORY_URL contains all third-party library source code packages and their historical versions.
+
 ## Special statement
 
 Starting from version 0.13, the dependency on the two third-party libraries [1] and [2] will be removed in the default compiled output. These two third-party libraries are under [GNU General Public License V3](https://www.gnu.org/licenses/gpl-3.0.en.html). This license is incompatible with [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0), so it should not appear in the Apache release by default.
diff --git a/docs/zh-CN/extending-doris/datax.md b/docs/zh-CN/extending-doris/datax.md
index fdc94f1..975fa7e 100644
--- a/docs/zh-CN/extending-doris/datax.md
+++ b/docs/zh-CN/extending-doris/datax.md
@@ -83,8 +83,22 @@ doriswriter 插件依赖的 DataX 代码中的一些模块。而这些模块并
 
         > hdfsreader, hdfswriter and oscarwriter 这三个插件需要额外的jar包。如果你并不需要这些插件,可以在 `DataX/pom.xml` 中删除这些插件的模块。
 
+	3. 编译错误
+
+		如遇到如下编译错误:
+
+		```
+		Could not find artifact com.alibaba.datax:datax-all:pom:0.0.1-SNAPSHOT ...
+		```
+
+		可尝试以下方式解决:
+
+		1. 下载 [alibaba-datax-maven-m2-20210928.tar.gz](https://doris-thirdparty-repo.bj.bcebos.com/thirdparty/alibaba-datax-maven-m2-20210928.tar.gz)
+		2. 解压后,将得到的 `alibaba/datax/` 目录,拷贝到所使用的 maven 对应的 `.m2/repository/com/alibaba/` 下。
+		3. 再次尝试编译。
+
 4. 按需提交修改。
 
 ### 示例
 
-doriswriter 插件的使用说明请参阅 [这里](https://github.com/apache/incubator-doris/blob/master/extension/DataX/doriswriter/doc/doriswriter.md)
\ No newline at end of file
+doriswriter 插件的使用说明请参阅 [这里](https://github.com/apache/incubator-doris/blob/master/extension/DataX/doriswriter/doc/doriswriter.md)
diff --git a/docs/zh-CN/installing/compilation.md b/docs/zh-CN/installing/compilation.md
index c932134..6847907 100644
--- a/docs/zh-CN/installing/compilation.md
+++ b/docs/zh-CN/installing/compilation.md
@@ -159,6 +159,7 @@ under the License.
    
     安装完成后,自行设置环境变量 `PATH`, `JAVA_HOME` 等。
     注意: Doris 0.14.0 的版本仍然使用gcc7 的依赖编译,之后的代码将使用gcc10 的依赖
+
 2. 编译 Doris
 
     ```
@@ -173,6 +174,23 @@ under the License.
 
     如遇到上述错误,请参照 [PR #4769](https://github.com/apache/incubator-doris/pull/4769/files) 修改 `fe/pom.xml` 中 cloudera 相关的仓库配置。
 
+2. 第三方依赖下载连接错误、失效等问题
+
+    Doris 所依赖的第三方库的下载连接都在 `thirdparty/vars.sh` 文件内。随着时间推移,一些下载连接可能会失效。如果遇到这种情况,可以使用如下两种方式解决:
+
+    1. 手动修改 `thirdparty/vars.sh` 文件
+
+        手动修改有问题的下载连接和对应的 MD5 值。
+
+    2. 使用第三方下载仓库:
+
+        ```
+        export REPOSITORY_URL=https://doris-thirdparty-repo.bj.bcebos.com/thirdparty
+        sh build-thirdparty.sh
+        ```
+
+        REPOSITORY_URL 中包含所有第三方库源码包和他们的历史版本。
+
 ## 特别声明
 
 自 0.13 版本开始,默认的编译产出中将取消对 [1] 和 [2] 两个第三方库的依赖。这两个第三方库为 [GNU General Public License V3](https://www.gnu.org/licenses/gpl-3.0.en.html) 协议。该协议与 [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0) 协议不兼容,因此默认不出现在 Apache 发布版本中。
diff --git a/fe/fe-core/src/main/java/org/apache/doris/PaloFe.java b/fe/fe-core/src/main/java/org/apache/doris/PaloFe.java
index c1aa1e9..713e4b6 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/PaloFe.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/PaloFe.java
@@ -25,6 +25,7 @@ import org.apache.doris.common.Log4jConfig;
 import org.apache.doris.common.ThreadPoolManager;
 import org.apache.doris.common.Version;
 import org.apache.doris.common.util.JdkUtils;
+import org.apache.doris.common.util.NetUtils;
 import org.apache.doris.http.HttpServer;
 import org.apache.doris.journal.bdbje.BDBDebugger;
 import org.apache.doris.journal.bdbje.BDBTool;
@@ -34,6 +35,9 @@ import org.apache.doris.service.ExecuteEnv;
 import org.apache.doris.service.FeServer;
 import org.apache.doris.service.FrontendOptions;
 
+import com.google.common.base.Charsets;
+import com.google.common.base.Strings;
+
 import org.apache.commons.cli.BasicParser;
 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.CommandLineParser;
@@ -42,9 +46,6 @@ import org.apache.commons.cli.ParseException;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 
-import com.google.common.base.Charsets;
-import com.google.common.base.Strings;
-
 import java.io.File;
 import java.io.IOException;
 import java.io.RandomAccessFile;
@@ -111,6 +112,9 @@ public class PaloFe {
 
             FrontendOptions.init();
 
+            // check that all required ports are available
+            checkAllPorts();
+
             if (Config.enable_bdbje_debug_mode) {
                 // Start in BDB Debug mode
                 BDBDebugger.get().startDebugMode(dorisHomeDir);
@@ -129,7 +133,7 @@ public class PaloFe {
             FeServer feServer = new FeServer(Config.rpc_port);
 
             feServer.start();
-
+            
             if (!Config.enable_http_server_v2) {
                 HttpServer httpServer = new HttpServer(
                         Config.http_port,
@@ -162,6 +166,25 @@ public class PaloFe {
         }
     }
 
+    private static void checkAllPorts() throws IOException {
+        if (!NetUtils.isPortAvailable(FrontendOptions.getLocalHostAddress(), Config.edit_log_port,
+                "Edit log port", NetUtils.EDIT_LOG_PORT_SUGGESTION)) {
+            throw new IOException("port " + Config.edit_log_port + " already in use");
+        }
+        if (!NetUtils.isPortAvailable(FrontendOptions.getLocalHostAddress(), Config.http_port,
+                "Http port", NetUtils.HTTP_PORT_SUGGESTION)) {
+            throw new IOException("port " + Config.http_port + " already in use");
+        }
+        if (!NetUtils.isPortAvailable(FrontendOptions.getLocalHostAddress(), Config.query_port,
+                "Query port", NetUtils.QUERY_PORT_SUGGESTION)) {
+            throw new IOException("port " + Config.query_port + " already in use");
+        }
+        if (!NetUtils.isPortAvailable(FrontendOptions.getLocalHostAddress(), Config.rpc_port,
+                "Rpc port", NetUtils.RPC_PORT_SUGGESTION)) {
+            throw new IOException("port " + Config.rpc_port + " already in use");
+        }
+    }
+
     /*
      * -v --version
      *      Print the version of Palo Frontend
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Catalog.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Catalog.java
index 5e4005a..6a248fc 100755
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Catalog.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Catalog.java
@@ -287,6 +287,7 @@ import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.stream.Collectors;
+
 import javax.annotation.Nullable;
 
 public class Catalog {
@@ -1456,7 +1457,9 @@ public class Catalog {
 
         Frontend fe = checkFeExist(selfNode.first, selfNode.second);
         if (fe == null) {
-            LOG.error("current node is not added to the cluster, will exit");
+            LOG.error("current node {}:{} is not added to the cluster, will exit." +
+                            " Your FE IP maybe changed, please set 'priority_networks' config in fe.conf properly.",
+                    selfNode.first, selfNode.second);
             System.exit(-1);
         } else if (fe.getRole() != role) {
             LOG.error("current node role is {} not match with frontend recorded role {}. will exit", role,
diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/NetUtils.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/NetUtils.java
index 6e17e2c..bbd47a9 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/common/util/NetUtils.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/NetUtils.java
@@ -20,9 +20,12 @@ package org.apache.doris.common.util;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 
+import java.io.IOException;
+import java.net.DatagramSocket;
 import java.net.InetAddress;
 import java.net.InetSocketAddress;
 import java.net.NetworkInterface;
+import java.net.ServerSocket;
 import java.net.SocketException;
 import java.net.UnknownHostException;
 import java.util.Enumeration;
@@ -31,6 +34,14 @@ import java.util.List;
 public class NetUtils {
     private static final Logger LOG = LogManager.getLogger(NetUtils.class);
 
+    public static final String EDIT_LOG_PORT_SUGGESTION = "Please change the 'edit_log_port' in fe.conf and try again." +
+            " But if this is not the first time your start this FE, please DO NOT change it. " +
+            " You need to find the service that occupies the port and shut it down, and then return the port to Doris.";
+    public static final String QUERY_PORT_SUGGESTION = "Please change the 'query_port' in fe.conf and try again.";
+    public static final String HTTP_PORT_SUGGESTION = "Please change the 'http_port' in fe.conf and try again. " +
+            "But you need to make sure that ALL FEs http_port are same.";
+    public static final String RPC_PORT_SUGGESTION = "Please change the 'rpc_port' in fe.conf and try again.";
+
     // Target format is "host:port"
     public static InetSocketAddress createSocketAddr(String target) {
         int colonIndex = target.indexOf(':');
@@ -74,4 +85,32 @@ public class NetUtils {
         }
         return hostName;
     }
-}
+
+    // This implementation is inspired by the Apache Camel project.
+    public static boolean isPortAvailable(String host, int port, String portName, String suggestion) {
+        ServerSocket ss = null;
+        DatagramSocket ds = null;
+        try {
+            ss = new ServerSocket(port);
+            ss.setReuseAddress(true);
+            ds = new DatagramSocket(port);
+            ds.setReuseAddress(true);
+            return true;
+        } catch (IOException e) {
+            LOG.warn("{} {} is already in use. {}", portName, port, suggestion, e);
+        } finally {
+            if (ds != null) {
+                ds.close();
+            }
+
+            if (ss != null) {
+                try {
+                    ss.close();
+                } catch (IOException e) {
+                    /* should not be thrown */
+                }
+            }
+        }
+        return false;
+    }
+}
\ No newline at end of file
diff --git a/fe/fe-core/src/main/java/org/apache/doris/journal/bdbje/BDBJEJournal.java b/fe/fe-core/src/main/java/org/apache/doris/journal/bdbje/BDBJEJournal.java
index dae99bc..553aa22 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/journal/bdbje/BDBJEJournal.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/journal/bdbje/BDBJEJournal.java
@@ -46,9 +46,6 @@ import java.io.ByteArrayInputStream;
 import java.io.DataInputStream;
 import java.io.File;
 import java.io.IOException;
-import java.net.InetAddress;
-import java.net.Socket;
-import java.net.UnknownHostException;
 import java.util.List;
 import java.util.concurrent.atomic.AtomicLong;
 
@@ -82,19 +79,10 @@ public class BDBJEJournal implements Journal {
      * node name is ip_port (the port is edit_log_port)
      */
     private void initBDBEnv(String nodeName) {
-        environmentPath = Catalog.getCurrentCatalog().getBdbDir();
-        try {
-            Pair<String, Integer> selfNode = Catalog.getCurrentCatalog().getSelfNode();
-            if (isPortUsing(selfNode.first, selfNode.second)) {
-                LOG.error("edit_log_port {} is already in use. will exit.", selfNode.second);
-                System.exit(-1);
-            }
-            selfNodeName = nodeName;
-            selfNodeHostPort = selfNode.first + ":" + selfNode.second;
-        } catch (IOException e) {
-            LOG.error(e);
-            System.exit(-1);
-        }
+        environmentPath = Catalog.getServingCatalog().getBdbDir();
+        Pair<String, Integer> selfNode = Catalog.getServingCatalog().getSelfNode();
+        selfNodeName = nodeName;
+        selfNodeHostPort = selfNode.first + ":" + selfNode.second;
     }
 
     /*
@@ -308,11 +296,11 @@ public class BDBJEJournal implements Journal {
         if (bdbEnvironment == null) {
             File dbEnv = new File(environmentPath);
             bdbEnvironment = new BDBEnvironment();
-            Pair<String, Integer> helperNode = Catalog.getCurrentCatalog().getHelperNode();
+            Pair<String, Integer> helperNode = Catalog.getServingCatalog().getHelperNode();
             String helperHostPort = helperNode.first + ":" + helperNode.second;
             try {
                 bdbEnvironment.setup(dbEnv, selfNodeName, selfNodeHostPort,
-                                     helperHostPort, Catalog.getCurrentCatalog().isElectable());
+                        helperHostPort, Catalog.getServingCatalog().isElectable());
             } catch (Exception e) {
                 LOG.error("catch an exception when setup bdb environment. will exit.", e);
                 System.exit(-1);
@@ -320,7 +308,6 @@ public class BDBJEJournal implements Journal {
         }
         
         // Open a new journal database or get last existing one as current journal database
-        Pair<String, Integer> helperNode = Catalog.getCurrentCatalog().getHelperNode();
         List<Long> dbNames = null;
         for (int i = 0; i < RETRY_TIME; i++) {
             try {
@@ -333,36 +320,43 @@ public class BDBJEJournal implements Journal {
                 if (dbNames.size() == 0) {
                     /*
                      *  This is the very first time to open. Usually, we will open a new database named "1".
-                     *  But when we start cluster with an image file copied from other cluster, 
+                     *  But when we start cluster with an image file copied from other cluster,
                      *  here we should open database with name image max journal id + 1.
-                     *  (default Catalog.getCurrentCatalog().getReplayedJournalId() is 0)
+                     *  (default Catalog.getServingCatalog().getReplayedJournalId() is 0)
                      */
-                    String dbName = Long.toString(Catalog.getCurrentCatalog().getReplayedJournalId() + 1);
+                    String dbName = Long.toString(Catalog.getServingCatalog().getReplayedJournalId() + 1);
                     LOG.info("the very first time to open bdb, dbname is {}", dbName);
                     currentJournalDB = bdbEnvironment.openDatabase(dbName);
                 } else {
                     // get last database as current journal database
                     currentJournalDB = bdbEnvironment.openDatabase(dbNames.get(dbNames.size() - 1).toString());
                 }
-                
+
                 // set next journal id
                 nextJournalId.set(getMaxJournalId() + 1);
-                
+
                 break;
             } catch (InsufficientLogException insufficientLogEx) {
-                // Copy the missing log files from a member of the replication group who owns the files
-                LOG.warn("catch insufficient log exception. will recover and try again.", insufficientLogEx);
-                NetworkRestore restore = new NetworkRestore();
-                NetworkRestoreConfig config = new NetworkRestoreConfig();
-                config.setRetainLogFiles(false);
-                restore.execute(insufficientLogEx, config);
-                bdbEnvironment.close();
-                bdbEnvironment.setup(new File(environmentPath), selfNodeName, selfNodeHostPort, 
-                                     helperNode.first + ":" + helperNode.second, Catalog.getCurrentCatalog().isElectable());
+                reSetupBdbEnvironment(insufficientLogEx);
             }
         }
     }
-    
+
+    private void reSetupBdbEnvironment(InsufficientLogException insufficientLogEx) {
+        LOG.warn("catch insufficient log exception. will recover and try again.", insufficientLogEx);
+        // Copy the missing log files from a member of the replication group who owns the files
+        // ATTN: here we use `getServingCatalog()`, because only serving catalog has helper nodes.
+        Pair<String, Integer> helperNode = Catalog.getServingCatalog().getHelperNode();
+        NetworkRestore restore = new NetworkRestore();
+        NetworkRestoreConfig config = new NetworkRestoreConfig();
+        config.setRetainLogFiles(false);
+        restore.execute(insufficientLogEx, config);
+        bdbEnvironment.close();
+        bdbEnvironment.setup(new File(environmentPath), selfNodeName, selfNodeHostPort,
+                helperNode.first + ":" + helperNode.second,
+                Catalog.getServingCatalog().isElectable());
+    }
+
     @Override
     public void deleteJournals(long deleteToJournalId) {
         List<Long> dbNames = getDatabaseNames();
@@ -370,7 +364,7 @@ public class BDBJEJournal implements Journal {
             LOG.info("delete database names is null.");
             return;
         }
-        
+
         String msg = "existing database names: ";
         for (long name : dbNames) {
             msg += name + " ";
@@ -420,41 +414,36 @@ public class BDBJEJournal implements Journal {
         }
 
         // Open a new journal database or get last existing one as current journal database
-        Pair<String, Integer> helperNode = Catalog.getCurrentCatalog().getHelperNode();
         List<Long>  dbNames = null;
         for (int i = 0; i < RETRY_TIME; i++) {
             try {
                 dbNames = bdbEnvironment.getDatabaseNames();
+                break;
             } catch (InsufficientLogException insufficientLogEx) {
-                // Copy the missing log files from a member of the replication group who owns the files
-                LOG.warn("catch insufficient log exception. will recover and try again.", insufficientLogEx);
-                NetworkRestore restore = new NetworkRestore();
-                NetworkRestoreConfig config = new NetworkRestoreConfig();
-                config.setRetainLogFiles(false);
-                restore.execute(insufficientLogEx, config);
-                bdbEnvironment.close();
-                bdbEnvironment.setup(new File(environmentPath), selfNodeName, selfNodeHostPort,
-                        helperNode.first + ":" + helperNode.second, Catalog.getCurrentCatalog().isElectable());
+                /*
+                 * If this is not a checkpoint thread, it may be the FE startup thread
+                 * or a replay thread. We will reopen bdbEnvironment in these 2 cases to get valid logs
+                 * from helper nodes.
+                 *
+                 * The checkpoint thread only runs on the Master FE, and the Master FE should not encounter
+                 * this exception. So if it happens, rethrow the exception.
+                 */
+                if (!Catalog.isCheckpointThread()) {
+                    reSetupBdbEnvironment(insufficientLogEx);
+                } else {
+                    throw insufficientLogEx;
+                }
             } catch (RollbackException rollbackEx) {
-                LOG.warn("catch rollback log exception. will reopen the ReplicatedEnvironment.", rollbackEx);
-                bdbEnvironment.closeReplicatedEnvironment();
-                bdbEnvironment.openReplicatedEnvironment(new File(environmentPath));
+                if (!Catalog.isCheckpointThread()) {
+                    LOG.warn("catch rollback log exception. will reopen the ReplicatedEnvironment.", rollbackEx);
+                    bdbEnvironment.closeReplicatedEnvironment();
+                    bdbEnvironment.openReplicatedEnvironment(new File(environmentPath));
+                } else {
+                    throw rollbackEx;
+                }
             }
         }
 
         return dbNames;
     }
-    
-    public boolean isPortUsing(String host, int port) throws UnknownHostException {  
-        boolean flag = false;  
-        InetAddress theAddress = InetAddress.getByName(host);
-        try {  
-            Socket socket = new Socket(theAddress, port);
-            flag = true;
-            socket.close();
-        } catch (IOException e) {
-            // do nothing
-        }  
-        return flag;  
-    }
 }
\ No newline at end of file
diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java b/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java
index 6a90dab..56e30b7 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java
@@ -83,7 +83,7 @@ public class Checkpoint extends MasterDaemon {
             if (imageVersion >= checkPointVersion) {
                 return;
             }
-        } catch (IOException e) {
+        } catch (Throwable e) {
             LOG.error("Does not get storage info", e);
             if (MetricRepo.isInit) {
                 MetricRepo.COUNTER_IMAGE_WRITE_FAILED.increase(1L);
@@ -117,7 +117,7 @@ public class Checkpoint extends MasterDaemon {
                 MetricRepo.COUNTER_IMAGE_WRITE_SUCCESS.increase(1L);
             }
             LOG.info("checkpoint finished save image.{}", replayedJournalId);
-        } catch (Exception e) {
+        } catch (Throwable e) {
             e.printStackTrace();
             LOG.error("Exception when generate new image file", e);
             if (MetricRepo.isInit) {
@@ -127,7 +127,7 @@ public class Checkpoint extends MasterDaemon {
         } finally {
             // destroy checkpoint catalog, reclaim memory
             catalog = null;
-            Catalog.destroyCheckpoint(); 
+            Catalog.destroyCheckpoint();
         }
         
         // push image file to all the other non master nodes
@@ -202,7 +202,7 @@ public class Checkpoint extends MasterDaemon {
                             if (minOtherNodesJournalId > id) {
                                 minOtherNodesJournalId = id;
                             }
-                        } catch (IOException e) {
+                        } catch (Throwable e) {
                             throw new CheckpointException(String.format("Exception when getting current replayed journal id. host=%s, port=%d",
                                     host, port), e);
                         } finally {
@@ -235,7 +235,7 @@ public class Checkpoint extends MasterDaemon {
             if (MetricRepo.isInit) {
                 MetricRepo.COUNTER_IMAGE_CLEAN_SUCCESS.increase(1L);
             }
-        } catch (IOException e) {
+        } catch (Throwable e) {
             LOG.error("Master delete old image file fail.", e);
             if (MetricRepo.isInit) {
                 MetricRepo.COUNTER_IMAGE_CLEAN_FAILED.increase(1L);

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org