You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2020/12/19 03:18:22 UTC

[incubator-doris] branch master updated: [Trace] Add trace for create tablet tasks (#5091)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 176dcf8  [Trace] Add trace for create tablet tasks (#5091)
176dcf8 is described below

commit 176dcf8bd9431d55e11a0dad4d3d3a07d74735d0
Author: Yingchun Lai <40...@qq.com>
AuthorDate: Sat Dec 19 11:18:12 2020 +0800

    [Trace] Add trace for create tablet tasks (#5091)
    
    Add trace for create tablet tasks, it's a useful tool for admin to find
    out the bottleneck when create tablets timeouted.
    For example, admin could enlarge 'tablet_map_shard_size' when found
    'got tablets shard lock' procedure cost too much time.
---
 be/src/agent/task_worker_pool.cpp | 16 ++++++++++++++++
 be/src/common/config.h            |  3 +++
 be/src/olap/storage_engine.cpp    |  1 +
 be/src/olap/tablet_manager.cpp    |  9 +++++++++
 4 files changed, 29 insertions(+)

diff --git a/be/src/agent/task_worker_pool.cpp b/be/src/agent/task_worker_pool.cpp
index a408748..0d10314 100644
--- a/be/src/agent/task_worker_pool.cpp
+++ b/be/src/agent/task_worker_pool.cpp
@@ -51,8 +51,10 @@
 #include "util/doris_metrics.h"
 #include "util/file_utils.h"
 #include "util/monotime.h"
+#include "util/scoped_cleanup.h"
 #include "util/stopwatch.hpp"
 #include "util/threadpool.h"
+#include "util/trace.h"
 
 using std::deque;
 using std::list;
@@ -257,6 +259,7 @@ void TaskWorkerPool::_remove_task_info(const TTaskType::type task_type, int64_t
     EnumToString(TTaskType, task_type, type_str);
     LOG(INFO) << "remove task info. type=" << type_str << ", signature=" << signature
               << ", queue_size=" << queue_size;
+    TRACE("remove task info");
 }
 
 void TaskWorkerPool::_finish_task(const TFinishTaskRequest& finish_task_request) {
@@ -278,6 +281,7 @@ void TaskWorkerPool::_finish_task(const TFinishTaskRequest& finish_task_request)
         }
         sleep(config::sleep_one_second);
     }
+    TRACE("finish task");
 }
 
 uint32_t TaskWorkerPool::_get_next_task_index(int32_t thread_count,
@@ -324,6 +328,17 @@ void TaskWorkerPool::_create_tablet_worker_thread_callback() {
             _tasks.pop_front();
         }
 
+        scoped_refptr<Trace> trace(new Trace);
+        MonotonicStopWatch watch;
+        watch.start();
+        SCOPED_CLEANUP({
+            if (watch.elapsed_time() / 1e9 > config::agent_task_trace_threshold_sec) {
+                LOG(WARNING) << "Trace:" << std::endl << trace->DumpToString(Trace::INCLUDE_ALL);
+            }
+        });
+        ADOPT_TRACE(trace.get());
+        TRACE("start to create tablet $0", create_tablet_req.tablet_id);
+
         TStatusCode::type status_code = TStatusCode::OK;
         std::vector<string> error_msgs;
         TStatus task_status;
@@ -351,6 +366,7 @@ void TaskWorkerPool::_create_tablet_worker_thread_callback() {
             tablet_info.__set_path_hash(tablet->data_dir()->path_hash());
             finish_tablet_infos.push_back(tablet_info);
         }
+        TRACE("StorageEngine create tablet finish, status: $0", create_status);
 
         task_status.__set_status_code(status_code);
         task_status.__set_error_msgs(error_msgs);
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 75ed1eb..93f6bfe 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -331,6 +331,9 @@ CONF_mInt64(row_step_for_compaction_merge_log, "0");
 CONF_mInt32(base_compaction_trace_threshold, "10");
 CONF_mInt32(cumulative_compaction_trace_threshold, "2");
 
+// Threshold to logging agent task trace, in seconds.
+CONF_mInt32(agent_task_trace_threshold_sec, "2");
+
 // time interval to record tablet scan count in second for the purpose of calculating tablet scan frequency
 CONF_mInt64(tablet_scan_frequency_time_node_interval_second, "300");
 // coefficient for tablet scan frequency and compaction score when finding a tablet for compaction
diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp
index 56f2d90..0147443 100644
--- a/be/src/olap/storage_engine.cpp
+++ b/be/src/olap/storage_engine.cpp
@@ -806,6 +806,7 @@ OLAPStatus StorageEngine::create_tablet(const TCreateTabletReq& request) {
         LOG(WARNING) << "there is no available disk that can be used to create tablet.";
         return OLAP_ERR_CE_CMD_PARAMS_ERROR;
     }
+    TRACE("got data directory for create tablet");
     return _tablet_manager->create_tablet(request, stores);
 }
 
diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp
index 1fd47c6..1567698 100644
--- a/be/src/olap/tablet_manager.cpp
+++ b/be/src/olap/tablet_manager.cpp
@@ -51,6 +51,7 @@
 #include "util/pretty_printer.h"
 #include "util/scoped_cleanup.h"
 #include "util/time.h"
+#include "util/trace.h"
 
 using std::list;
 using std::map;
@@ -211,6 +212,7 @@ OLAPStatus TabletManager::create_tablet(const TCreateTabletReq& request,
               << ", schema_hash=" << schema_hash;
 
     WriteLock wlock(_get_tablets_shard_lock(tablet_id));
+    TRACE("got tablets shard lock");
     // Make create_tablet operation to be idempotent:
     // 1. Return true if tablet with same tablet_id and schema_hash exist;
     //           false if tablet with same tablet_id but different schema_hash exist.
@@ -251,6 +253,7 @@ OLAPStatus TabletManager::create_tablet(const TCreateTabletReq& request,
         stores.clear();
         stores.push_back(base_tablet->data_dir());
     }
+    TRACE("got base tablet");
 
     // set alter type to schema-change. it is useless
     TabletSharedPtr tablet = _internal_create_tablet_unlocked(
@@ -260,6 +263,7 @@ OLAPStatus TabletManager::create_tablet(const TCreateTabletReq& request,
         DorisMetrics::instance()->create_tablet_requests_failed->increment(1);
         return OLAP_ERR_CE_CMD_PARAMS_ERROR;
     }
+    TRACE("succeed to create tablet");
 
     LOG(INFO) << "success to create tablet. tablet_id=" << tablet_id
               << ", schema_hash=" << schema_hash;
@@ -282,6 +286,7 @@ TabletSharedPtr TabletManager::_internal_create_tablet_unlocked(
     if (tablet == nullptr) {
         return nullptr;
     }
+    TRACE("create tablet meta");
 
     int64_t new_tablet_id = request.tablet_id;
     int32_t new_schema_hash = request.tablet_schema.schema_hash;
@@ -314,6 +319,7 @@ TabletSharedPtr TabletManager::_internal_create_tablet_unlocked(
                 LOG(WARNING) << "fail to create initial version for tablet. res=" << res;
                 break;
             }
+            TRACE("create initial rowset");
         }
         if (is_schema_change) {
             if (request.__isset.base_tablet_id && request.base_tablet_id > 0) {
@@ -341,6 +347,7 @@ TabletSharedPtr TabletManager::_internal_create_tablet_unlocked(
                 int64_t new_creation_time = base_tablet->creation_time() + 1;
                 tablet->set_creation_time(new_creation_time);
             }
+            TRACE("update schema change info");
         }
         // Add tablet to StorageEngine will make it visible to user
         res = _add_tablet_unlocked(new_tablet_id, new_schema_hash, tablet, true, false);
@@ -358,6 +365,7 @@ TabletSharedPtr TabletManager::_internal_create_tablet_unlocked(
             LOG(WARNING) << "fail to get tablet. res=" << res;
             break;
         }
+        TRACE("add tablet to StorageEngine");
     } while (0);
 
     if (res == OLAP_SUCCESS) {
@@ -373,6 +381,7 @@ TabletSharedPtr TabletManager::_internal_create_tablet_unlocked(
         tablet->delete_all_files();
         TabletMetaManager::remove(data_dir, new_tablet_id, new_schema_hash);
     }
+    TRACE("revert changes on error");
     return nullptr;
 }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org