You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by ji...@apache.org on 2022/12/04 01:14:21 UTC

[doris] branch branch-1.1-lts updated: [branch-1.1](log) add tracing log for publish task (#14786)

This is an automated email from the ASF dual-hosted git repository.

jiafengzheng pushed a commit to branch branch-1.1-lts
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-1.1-lts by this push:
     new 6c9caec6fd [branch-1.1](log) add tracing log for publish task (#14786)
6c9caec6fd is described below

commit 6c9caec6fd325652a647c070d8ae78e5262744f0
Author: Mingyu Chen <mo...@163.com>
AuthorDate: Sun Dec 4 09:14:16 2022 +0800

    [branch-1.1](log) add tracing log for publish task (#14786)
---
 be/src/agent/task_worker_pool.cpp                |  2 --
 be/src/common/config.h                           |  3 ++-
 be/src/olap/data_dir.h                           |  5 +++++
 be/src/olap/task/engine_publish_version_task.cpp | 23 +++++++++++++++++++----
 be/src/olap/txn_manager.cpp                      | 12 +++++++++---
 be/src/olap/txn_manager.h                        |  4 ++--
 be/test/olap/delta_writer_test.cpp               |  6 ++++--
 be/test/olap/txn_manager_test.cpp                |  6 ++++--
 8 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/be/src/agent/task_worker_pool.cpp b/be/src/agent/task_worker_pool.cpp
index cb78c3beb6..41838f788d 100644
--- a/be/src/agent/task_worker_pool.cpp
+++ b/be/src/agent/task_worker_pool.cpp
@@ -729,14 +729,12 @@ void TaskWorkerPool::_publish_version_worker_thread_callback() {
             st = Status::RuntimeError(strings::Substitute("publish version failed. error=$0", res));
             finish_task_request.__set_error_tablet_ids(error_tablet_ids);
         } else {
-            int submit_tablets = 0;
             if (config::enable_quick_compaction && config::quick_compaction_batch_size > 0) {
                 for (auto& entry : succ_tablet_ids) {
                     TabletSharedPtr tablet =
                             StorageEngine::instance()->tablet_manager()->get_tablet(
                                     entry.first, entry.second);
                     if (tablet != nullptr) {
-                        submit_tablets++;
                         tablet->publised_count++;
                         if (tablet->publised_count % config::quick_compaction_batch_size == 0) {
                             StorageEngine::instance()->submit_quick_compaction_task(tablet);
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 68c5e6cd7e..d3b45ba109 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -308,6 +308,7 @@ CONF_mInt64(row_step_for_compaction_merge_log, "0");
 // Threshold to logging compaction trace, in seconds.
 CONF_mInt32(base_compaction_trace_threshold, "60");
 CONF_mInt32(cumulative_compaction_trace_threshold, "10");
+CONF_mInt32(publish_task_trace_threshold, "2");
 CONF_mBool(disable_compaction_trace_log, "true");
 
 // Threshold to logging agent task trace, in seconds.
@@ -553,7 +554,7 @@ CONF_mInt64(max_runnings_transactions_per_txn_map, "100");
 
 // tablet_map_lock shard size, the value is 2^n, n=0,1,2,3,4
 // this is a an enhancement for better performance to manage tablet
-CONF_Int32(tablet_map_shard_size, "1");
+CONF_Int32(tablet_map_shard_size, "4");
 
 // txn_map_lock shard size, the value is 2^n, n=0,1,2,3,4
 // this is a an enhancement for better performance to manage txn
diff --git a/be/src/olap/data_dir.h b/be/src/olap/data_dir.h
index 68933167c2..3b4a41799f 100644
--- a/be/src/olap/data_dir.h
+++ b/be/src/olap/data_dir.h
@@ -40,6 +40,11 @@ class TabletManager;
 class TabletMeta;
 class TxnManager;
 
+struct PublishStatistic {
+    size_t get_lock_time = 0;
+    size_t save_meta_time = 0;
+};
+
 // A DataDir used to manage data in same path.
 // Now, After DataDir was created, it will never be deleted for easy implementation.
 class DataDir {
diff --git a/be/src/olap/task/engine_publish_version_task.cpp b/be/src/olap/task/engine_publish_version_task.cpp
index 1abace542a..8d9aff21dc 100644
--- a/be/src/olap/task/engine_publish_version_task.cpp
+++ b/be/src/olap/task/engine_publish_version_task.cpp
@@ -22,6 +22,7 @@
 #include "olap/data_dir.h"
 #include "olap/rowset/rowset_meta_manager.h"
 #include "olap/tablet_manager.h"
+#include "util/scoped_cleanup.h"
 
 namespace doris {
 
@@ -37,9 +38,20 @@ EnginePublishVersionTask::EnginePublishVersionTask(TPublishVersionRequest& publi
 OLAPStatus EnginePublishVersionTask::finish() {
     OLAPStatus res = OLAP_SUCCESS;
     int64_t transaction_id = _publish_version_req.transaction_id;
-    OlapStopWatch watch;
+    // OlapStopWatch watch;
     VLOG_NOTICE << "begin to process publish version. transaction_id=" << transaction_id;
 
+    PublishStatistic stat;
+    MonotonicStopWatch watch;
+    watch.start();
+    SCOPED_CLEANUP({
+            if (watch.elapsed_time() / 1e9 > config::publish_task_trace_threshold) {
+            LOG(WARNING) << "Trace " << transaction_id << " publish stat(ns): get lock time: "
+                         << stat.get_lock_time << ", save meta time: " << stat.save_meta_time;
+            }
+            });
+
+    size_t tablet_num = 0;
     // each partition
     for (auto& par_ver_info : _publish_version_req.partition_version_infos) {
         int64_t partition_id = par_ver_info.partition_id;
@@ -59,6 +71,7 @@ OLAPStatus EnginePublishVersionTask::finish() {
 
         Version version(par_ver_info.version, par_ver_info.version);
 
+        tablet_num += tablet_related_rs.size();
         // each tablet
         for (auto& tablet_rs : tablet_related_rs) {
             OLAPStatus publish_status = OLAP_SUCCESS;
@@ -90,7 +103,7 @@ OLAPStatus EnginePublishVersionTask::finish() {
             }
 
             publish_status = StorageEngine::instance()->txn_manager()->publish_txn(
-                    partition_id, tablet, transaction_id, version);
+                    partition_id, tablet, transaction_id, version, &stat);
             if (publish_status != OLAP_SUCCESS) {
                 LOG(WARNING) << "failed to publish version. rowset_id=" << rowset->rowset_id()
                              << ", tablet_id=" << tablet_info.tablet_id
@@ -155,8 +168,10 @@ OLAPStatus EnginePublishVersionTask::finish() {
     }
 
     LOG(INFO) << "finish to publish version on transaction."
-              << "transaction_id=" << transaction_id << ", cost(us): " << watch.get_elapse_time_us()
-              << ", error_tablet_size=" << _error_tablet_ids->size();
+              << "transaction_id=" << transaction_id << ", cost(us): " << watch.elapsed_time() / 1000
+              << ", error_tablet_size=" << _error_tablet_ids->size()
+              << ", partition num: " << _publish_version_req.partition_version_infos.size()
+              << ", tablet num: " << tablet_num;
     return res;
 }
 
diff --git a/be/src/olap/txn_manager.cpp b/be/src/olap/txn_manager.cpp
index 799aaffbda..31079c292b 100644
--- a/be/src/olap/txn_manager.cpp
+++ b/be/src/olap/txn_manager.cpp
@@ -97,9 +97,10 @@ OLAPStatus TxnManager::commit_txn(TPartitionId partition_id, const TabletSharedP
 }
 
 OLAPStatus TxnManager::publish_txn(TPartitionId partition_id, const TabletSharedPtr& tablet,
-                                   TTransactionId transaction_id, const Version& version) {
+                                   TTransactionId transaction_id, const Version& version,
+                                   PublishStatistic* stat) {
     return publish_txn(tablet->data_dir()->get_meta(), partition_id, transaction_id,
-                       tablet->tablet_id(), tablet->schema_hash(), tablet->tablet_uid(), version);
+                       tablet->tablet_id(), tablet->schema_hash(), tablet->tablet_uid(), version, stat);
 }
 
 // delete the txn from manager if it is not committed(not have a valid rowset)
@@ -257,13 +258,16 @@ OLAPStatus TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id,
 OLAPStatus TxnManager::publish_txn(OlapMeta* meta, TPartitionId partition_id,
                                    TTransactionId transaction_id, TTabletId tablet_id,
                                    SchemaHash schema_hash, TabletUid tablet_uid,
-                                   const Version& version) {
+                                   const Version& version, PublishStatistic* stat) {
+    MonotonicStopWatch watch;
+    watch.start();
     pair<int64_t, int64_t> key(partition_id, transaction_id);
     TabletInfo tablet_info(tablet_id, schema_hash, tablet_uid);
     RowsetSharedPtr rowset_ptr = nullptr;
     MutexLock txn_lock(&_get_txn_lock(transaction_id));
     {
         ReadLock rlock(_get_txn_map_lock(transaction_id));
+        stat->get_lock_time += watch.elapsed_time();
         txn_tablet_map_t& txn_tablet_map = _get_txn_tablet_map(transaction_id);
         auto it = txn_tablet_map.find(key);
         if (it != txn_tablet_map.end()) {
@@ -281,6 +285,7 @@ OLAPStatus TxnManager::publish_txn(OlapMeta* meta, TPartitionId partition_id,
     if (rowset_ptr != nullptr) {
         // TODO(ygl): rowset is already set version here, memory is changed, if save failed
         // it maybe a fatal error
+        watch.reset();
         rowset_ptr->make_visible(version);
         OLAPStatus save_status =
                 RowsetMetaManager::save(meta, tablet_uid, rowset_ptr->rowset_id(),
@@ -291,6 +296,7 @@ OLAPStatus TxnManager::publish_txn(OlapMeta* meta, TPartitionId partition_id,
                          << ", txn id:" << transaction_id;
             return OLAP_ERR_ROWSET_SAVE_FAILED;
         }
+        stat->save_meta_time += watch.elapsed_time();
     } else {
         return OLAP_ERR_TRANSACTION_NOT_EXIST;
     }
diff --git a/be/src/olap/txn_manager.h b/be/src/olap/txn_manager.h
index 24ab6fb7aa..ef4665ea86 100644
--- a/be/src/olap/txn_manager.h
+++ b/be/src/olap/txn_manager.h
@@ -79,7 +79,7 @@ public:
                           const RowsetSharedPtr& rowset_ptr, bool is_recovery);
 
     OLAPStatus publish_txn(TPartitionId partition_id, const TabletSharedPtr& tablet,
-                           TTransactionId transaction_id, const Version& version);
+                           TTransactionId transaction_id, const Version& version, PublishStatistic* stat = nullptr);
 
     // delete the txn from manager if it is not committed(not have a valid rowset)
     OLAPStatus rollback_txn(TPartitionId partition_id, const TabletSharedPtr& tablet,
@@ -103,7 +103,7 @@ public:
     // not persist rowset meta because
     OLAPStatus publish_txn(OlapMeta* meta, TPartitionId partition_id, TTransactionId transaction_id,
                            TTabletId tablet_id, SchemaHash schema_hash, TabletUid tablet_uid,
-                           const Version& version);
+                           const Version& version, PublishStatistic* stat = nullptr);
 
     // delete the txn from manager if it is not committed(not have a valid rowset)
     OLAPStatus rollback_txn(TPartitionId partition_id, TTransactionId transaction_id,
diff --git a/be/test/olap/delta_writer_test.cpp b/be/test/olap/delta_writer_test.cpp
index f4fd29d44b..34a38655ed 100644
--- a/be/test/olap/delta_writer_test.cpp
+++ b/be/test/olap/delta_writer_test.cpp
@@ -488,11 +488,12 @@ TEST_F(TestDeltaWriter, write) {
     std::map<TabletInfo, RowsetSharedPtr> tablet_related_rs;
     StorageEngine::instance()->txn_manager()->get_txn_related_tablets(
             write_req.txn_id, write_req.partition_id, &tablet_related_rs);
+    PublishStatistic stat;
     for (auto& tablet_rs : tablet_related_rs) {
         RowsetSharedPtr rowset = tablet_rs.second;
         res = k_engine->txn_manager()->publish_txn(meta, write_req.partition_id, write_req.txn_id,
                                                    write_req.tablet_id, write_req.schema_hash,
-                                                   tablet_rs.first.tablet_uid, version);
+                                                   tablet_rs.first.tablet_uid, version, &stat);
         EXPECT_EQ(OLAP_SUCCESS, res);
         res = tablet->add_inc_rowset(rowset);
         EXPECT_EQ(OLAP_SUCCESS, res);
@@ -559,11 +560,12 @@ TEST_F(TestDeltaWriter, sequence_col) {
     std::map<TabletInfo, RowsetSharedPtr> tablet_related_rs;
     StorageEngine::instance()->txn_manager()->get_txn_related_tablets(
             write_req.txn_id, write_req.partition_id, &tablet_related_rs);
+    PublishStatistic stat;
     for (auto& tablet_rs : tablet_related_rs) {
         RowsetSharedPtr rowset = tablet_rs.second;
         res = k_engine->txn_manager()->publish_txn(meta, write_req.partition_id, write_req.txn_id,
                                                    write_req.tablet_id, write_req.schema_hash,
-                                                   tablet_rs.first.tablet_uid, version);
+                                                   tablet_rs.first.tablet_uid, version, &stat);
         EXPECT_EQ(OLAP_SUCCESS, res);
         res = tablet->add_inc_rowset(rowset);
         EXPECT_EQ(OLAP_SUCCESS, res);
diff --git a/be/test/olap/txn_manager_test.cpp b/be/test/olap/txn_manager_test.cpp
index a3213da90d..a9220b27d0 100644
--- a/be/test/olap/txn_manager_test.cpp
+++ b/be/test/olap/txn_manager_test.cpp
@@ -283,8 +283,9 @@ TEST_F(TxnManagerTest, PublishVersionSuccessful) {
                                  _tablet_uid, load_id, _alpha_rowset, false);
     EXPECT_TRUE(status == OLAP_SUCCESS);
     Version new_version(10, 11);
+    PublishStatistic stat;
     status = _txn_mgr->publish_txn(_meta, partition_id, transaction_id, tablet_id, schema_hash,
-                                   _tablet_uid, new_version);
+                                   _tablet_uid, new_version, &stat);
     EXPECT_TRUE(status == OLAP_SUCCESS);
 
     RowsetMetaSharedPtr rowset_meta(new AlphaRowsetMeta());
@@ -299,8 +300,9 @@ TEST_F(TxnManagerTest, PublishVersionSuccessful) {
 // 1. publish version failed if not found related txn and rowset
 TEST_F(TxnManagerTest, PublishNotExistedTxn) {
     Version new_version(10, 11);
+    PublishStatistic stat;
     OLAPStatus status = _txn_mgr->publish_txn(_meta, partition_id, transaction_id, tablet_id,
-                                              schema_hash, _tablet_uid, new_version);
+                                              schema_hash, _tablet_uid, new_version, &stat);
     EXPECT_TRUE(status != OLAP_SUCCESS);
 }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org