You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2020/12/19 03:18:22 UTC
[incubator-doris] branch master updated: [Trace] Add trace for
create tablet tasks (#5091)
This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push:
new 176dcf8 [Trace] Add trace for create tablet tasks (#5091)
176dcf8 is described below
commit 176dcf8bd9431d55e11a0dad4d3d3a07d74735d0
Author: Yingchun Lai <40...@qq.com>
AuthorDate: Sat Dec 19 11:18:12 2020 +0800
[Trace] Add trace for create tablet tasks (#5091)
Add tracing for create tablet tasks. It is a useful tool for admins to find
the bottleneck when tablet creation times out.
For example, an admin could enlarge 'tablet_map_shard_size' upon finding that
the 'got tablets shard lock' step takes too much time.
---
be/src/agent/task_worker_pool.cpp | 16 ++++++++++++++++
be/src/common/config.h | 3 +++
be/src/olap/storage_engine.cpp | 1 +
be/src/olap/tablet_manager.cpp | 9 +++++++++
4 files changed, 29 insertions(+)
diff --git a/be/src/agent/task_worker_pool.cpp b/be/src/agent/task_worker_pool.cpp
index a408748..0d10314 100644
--- a/be/src/agent/task_worker_pool.cpp
+++ b/be/src/agent/task_worker_pool.cpp
@@ -51,8 +51,10 @@
#include "util/doris_metrics.h"
#include "util/file_utils.h"
#include "util/monotime.h"
+#include "util/scoped_cleanup.h"
#include "util/stopwatch.hpp"
#include "util/threadpool.h"
+#include "util/trace.h"
using std::deque;
using std::list;
@@ -257,6 +259,7 @@ void TaskWorkerPool::_remove_task_info(const TTaskType::type task_type, int64_t
EnumToString(TTaskType, task_type, type_str);
LOG(INFO) << "remove task info. type=" << type_str << ", signature=" << signature
<< ", queue_size=" << queue_size;
+ TRACE("remove task info");
}
void TaskWorkerPool::_finish_task(const TFinishTaskRequest& finish_task_request) {
@@ -278,6 +281,7 @@ void TaskWorkerPool::_finish_task(const TFinishTaskRequest& finish_task_request)
}
sleep(config::sleep_one_second);
}
+ TRACE("finish task");
}
uint32_t TaskWorkerPool::_get_next_task_index(int32_t thread_count,
@@ -324,6 +328,17 @@ void TaskWorkerPool::_create_tablet_worker_thread_callback() {
_tasks.pop_front();
}
+ scoped_refptr<Trace> trace(new Trace);
+ MonotonicStopWatch watch;
+ watch.start();
+ SCOPED_CLEANUP({
+ if (watch.elapsed_time() / 1e9 > config::agent_task_trace_threshold_sec) {
+ LOG(WARNING) << "Trace:" << std::endl << trace->DumpToString(Trace::INCLUDE_ALL);
+ }
+ });
+ ADOPT_TRACE(trace.get());
+ TRACE("start to create tablet $0", create_tablet_req.tablet_id);
+
TStatusCode::type status_code = TStatusCode::OK;
std::vector<string> error_msgs;
TStatus task_status;
@@ -351,6 +366,7 @@ void TaskWorkerPool::_create_tablet_worker_thread_callback() {
tablet_info.__set_path_hash(tablet->data_dir()->path_hash());
finish_tablet_infos.push_back(tablet_info);
}
+ TRACE("StorageEngine create tablet finish, status: $0", create_status);
task_status.__set_status_code(status_code);
task_status.__set_error_msgs(error_msgs);
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 75ed1eb..93f6bfe 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -331,6 +331,9 @@ CONF_mInt64(row_step_for_compaction_merge_log, "0");
CONF_mInt32(base_compaction_trace_threshold, "10");
CONF_mInt32(cumulative_compaction_trace_threshold, "2");
+// Threshold for logging the agent task trace, in seconds.
+CONF_mInt32(agent_task_trace_threshold_sec, "2");
+
// time interval to record tablet scan count in second for the purpose of calculating tablet scan frequency
CONF_mInt64(tablet_scan_frequency_time_node_interval_second, "300");
// coefficient for tablet scan frequency and compaction score when finding a tablet for compaction
diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp
index 56f2d90..0147443 100644
--- a/be/src/olap/storage_engine.cpp
+++ b/be/src/olap/storage_engine.cpp
@@ -806,6 +806,7 @@ OLAPStatus StorageEngine::create_tablet(const TCreateTabletReq& request) {
LOG(WARNING) << "there is no available disk that can be used to create tablet.";
return OLAP_ERR_CE_CMD_PARAMS_ERROR;
}
+ TRACE("got data directory for create tablet");
return _tablet_manager->create_tablet(request, stores);
}
diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp
index 1fd47c6..1567698 100644
--- a/be/src/olap/tablet_manager.cpp
+++ b/be/src/olap/tablet_manager.cpp
@@ -51,6 +51,7 @@
#include "util/pretty_printer.h"
#include "util/scoped_cleanup.h"
#include "util/time.h"
+#include "util/trace.h"
using std::list;
using std::map;
@@ -211,6 +212,7 @@ OLAPStatus TabletManager::create_tablet(const TCreateTabletReq& request,
<< ", schema_hash=" << schema_hash;
WriteLock wlock(_get_tablets_shard_lock(tablet_id));
+ TRACE("got tablets shard lock");
// Make create_tablet operation to be idempotent:
// 1. Return true if tablet with same tablet_id and schema_hash exist;
// false if tablet with same tablet_id but different schema_hash exist.
@@ -251,6 +253,7 @@ OLAPStatus TabletManager::create_tablet(const TCreateTabletReq& request,
stores.clear();
stores.push_back(base_tablet->data_dir());
}
+ TRACE("got base tablet");
// set alter type to schema-change. it is useless
TabletSharedPtr tablet = _internal_create_tablet_unlocked(
@@ -260,6 +263,7 @@ OLAPStatus TabletManager::create_tablet(const TCreateTabletReq& request,
DorisMetrics::instance()->create_tablet_requests_failed->increment(1);
return OLAP_ERR_CE_CMD_PARAMS_ERROR;
}
+ TRACE("succeed to create tablet");
LOG(INFO) << "success to create tablet. tablet_id=" << tablet_id
<< ", schema_hash=" << schema_hash;
@@ -282,6 +286,7 @@ TabletSharedPtr TabletManager::_internal_create_tablet_unlocked(
if (tablet == nullptr) {
return nullptr;
}
+ TRACE("create tablet meta");
int64_t new_tablet_id = request.tablet_id;
int32_t new_schema_hash = request.tablet_schema.schema_hash;
@@ -314,6 +319,7 @@ TabletSharedPtr TabletManager::_internal_create_tablet_unlocked(
LOG(WARNING) << "fail to create initial version for tablet. res=" << res;
break;
}
+ TRACE("create initial rowset");
}
if (is_schema_change) {
if (request.__isset.base_tablet_id && request.base_tablet_id > 0) {
@@ -341,6 +347,7 @@ TabletSharedPtr TabletManager::_internal_create_tablet_unlocked(
int64_t new_creation_time = base_tablet->creation_time() + 1;
tablet->set_creation_time(new_creation_time);
}
+ TRACE("update schema change info");
}
// Add tablet to StorageEngine will make it visible to user
res = _add_tablet_unlocked(new_tablet_id, new_schema_hash, tablet, true, false);
@@ -358,6 +365,7 @@ TabletSharedPtr TabletManager::_internal_create_tablet_unlocked(
LOG(WARNING) << "fail to get tablet. res=" << res;
break;
}
+ TRACE("add tablet to StorageEngine");
} while (0);
if (res == OLAP_SUCCESS) {
@@ -373,6 +381,7 @@ TabletSharedPtr TabletManager::_internal_create_tablet_unlocked(
tablet->delete_all_files();
TabletMetaManager::remove(data_dir, new_tablet_id, new_schema_hash);
}
+ TRACE("revert changes on error");
return nullptr;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org