You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2022/05/16 17:11:03 UTC
[kudu] branch master updated: [test] fix ASAN test failures in master-stress-test
This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new e2f0d8fb0 [test] fix ASAN test failures in master-stress-test
e2f0d8fb0 is described below
commit e2f0d8fb056d9834199ec2feab8888a30daefae7
Author: zhangyifan27 <ch...@163.com>
AuthorDate: Thu May 5 20:30:10 2022 +0800
[test] fix ASAN test failures in master-stress-test
The MasterStressTest occasionally failed in ASAN builds [1] [2]. I also built
the test in ASAN mode on a CentOS 7.9 machine, ran it in slow mode 3 times and
all failed.
It seems that some RetryingTSRpcTask is still running after 'TableInfo' object
destroyed if we set --enable_metadata_cleanup_for_deleted_tables_and_tablets=true.
To fix heap-use-after-free errors, we should abort and wait all pending tasks
completed before we destroy a TabletInfo object. With this fix the ASAN test
can pass 10 times.
[1] http://jenkins.kudu.apache.org/job/kudu-gerrit/25414/BUILD_TYPE=ASAN/
[2] http://jenkins.kudu.apache.org/job/kudu-gerrit/25448/BUILD_TYPE=ASAN/
Change-Id: I4eff45fbf05644362169485cd32678509eab1b07
Reviewed-on: http://gerrit.cloudera.org:8080/18501
Tested-by: Kudu Jenkins
Reviewed-by: Alexey Serbin <al...@apache.org>
---
src/kudu/master/catalog_manager.cc | 49 ++++++++++++++++++++++----------------
1 file changed, 28 insertions(+), 21 deletions(-)
diff --git a/src/kudu/master/catalog_manager.cc b/src/kudu/master/catalog_manager.cc
index 7af3873ba..a1cf87ac8 100644
--- a/src/kudu/master/catalog_manager.cc
+++ b/src/kudu/master/catalog_manager.cc
@@ -3845,14 +3845,17 @@ class PickLeaderReplica : public TSPicker {
//
// The target tablet server is refreshed before each RPC by consulting the provided
// TSPicker implementation.
+// Each created RetryingTSRpcTask should be added to TableInfo::pending_tasks_ by
+// calling TableInfo::AddTask(), so 'table' must remain valid for the lifetime of
+// this class.
class RetryingTSRpcTask : public MonitoredTask {
public:
- RetryingTSRpcTask(Master *master,
+ RetryingTSRpcTask(Master* master,
unique_ptr<TSPicker> replica_picker,
- scoped_refptr<TableInfo> table)
+ TableInfo* table)
: master_(master),
replica_picker_(std::move(replica_picker)),
- table_(std::move(table)),
+ table_(table),
start_ts_(MonoTime::Now()),
deadline_(start_ts_ + MonoDelta::FromMilliseconds(FLAGS_unresponsive_ts_rpc_timeout_ms)),
attempt_(0),
@@ -3876,7 +3879,7 @@ class RetryingTSRpcTask : public MonitoredTask {
MonoTime start_timestamp() const override { return start_ts_; }
MonoTime completion_timestamp() const override { return end_ts_; }
- const scoped_refptr<TableInfo>& table() const { return table_ ; }
+ TableInfo* table() const { return table_; }
protected:
// Send an RPC request and register a callback.
@@ -3919,7 +3922,8 @@ class RetryingTSRpcTask : public MonitoredTask {
Master * const master_;
const unique_ptr<TSPicker> replica_picker_;
- const scoped_refptr<TableInfo> table_;
+ // RetryingTSRpcTask is owned by 'TableInfo', so the backpointer should be raw.
+ TableInfo* table_;
MonoTime start_ts_;
MonoTime end_ts_;
@@ -4091,7 +4095,7 @@ class RetrySpecificTSRpcTask : public RetryingTSRpcTask {
public:
RetrySpecificTSRpcTask(Master* master,
const string& permanent_uuid,
- const scoped_refptr<TableInfo>& table)
+ TableInfo* table)
: RetryingTSRpcTask(master,
unique_ptr<TSPicker>(new PickSpecificUUID(permanent_uuid)),
table),
@@ -4113,7 +4117,7 @@ class AsyncCreateReplica : public RetrySpecificTSRpcTask {
const string& permanent_uuid,
const scoped_refptr<TabletInfo>& tablet,
const TabletMetadataLock& tablet_lock)
- : RetrySpecificTSRpcTask(master, permanent_uuid, tablet->table()),
+ : RetrySpecificTSRpcTask(master, permanent_uuid, tablet->table().get()),
tablet_id_(tablet->id()) {
deadline_ = start_ts_ + MonoDelta::FromMilliseconds(FLAGS_tablet_creation_timeout_ms);
@@ -4179,17 +4183,17 @@ class AsyncCreateReplica : public RetrySpecificTSRpcTask {
// Send a DeleteTablet() RPC request.
class AsyncDeleteReplica : public RetrySpecificTSRpcTask {
public:
- AsyncDeleteReplica(
- Master* master, const string& permanent_uuid,
- const scoped_refptr<TableInfo>& table, string tablet_id,
- TabletDataState delete_type,
- optional<int64_t> cas_config_opid_index_less_or_equal,
- string reason)
+ AsyncDeleteReplica(Master* master,
+ const string& permanent_uuid,
+ TableInfo* table,
+ string tablet_id,
+ TabletDataState delete_type,
+ optional<int64_t> cas_config_opid_index_less_or_equal,
+ string reason)
: RetrySpecificTSRpcTask(master, permanent_uuid, table),
tablet_id_(std::move(tablet_id)),
delete_type_(delete_type),
- cas_config_opid_index_less_or_equal_(
- std::move(cas_config_opid_index_less_or_equal)),
+ cas_config_opid_index_less_or_equal_(std::move(cas_config_opid_index_less_or_equal)),
reason_(std::move(reason)) {}
string type_name() const override {
@@ -4294,7 +4298,7 @@ class AsyncAlterTable : public RetryingTSRpcTask {
scoped_refptr<TabletInfo> tablet)
: RetryingTSRpcTask(master,
unique_ptr<TSPicker>(new PickLeaderReplica(tablet)),
- tablet->table()),
+ tablet->table().get()),
tablet_(std::move(tablet)) {
}
@@ -4394,7 +4398,7 @@ AsyncChangeConfigTask::AsyncChangeConfigTask(Master* master,
consensus::ChangeConfigType change_config_type)
: RetryingTSRpcTask(master,
unique_ptr<TSPicker>(new PickLeaderReplica(tablet)),
- tablet->table()),
+ tablet->table().get()),
tablet_(std::move(tablet)),
cstate_(std::move(cstate)),
change_config_type_(change_config_type) {
@@ -4759,7 +4763,7 @@ Status CatalogManager::ProcessTabletReport(
// TODO(unknown): Cancel tablet creation, instead of deleting, in cases
// where that might be possible (tablet creation timeout & replacement).
rpcs.emplace_back(new AsyncDeleteReplica(
- master_, ts_desc->permanent_uuid(), table, tablet_id,
+ master_, ts_desc->permanent_uuid(), table.get(), tablet_id,
TABLET_DATA_DELETED, none, msg));
continue;
}
@@ -4786,7 +4790,7 @@ Status CatalogManager::ProcessTabletReport(
"Replica has no consensus available" :
Substitute("Replica with old config index $0", report_opid_index);
rpcs.emplace_back(new AsyncDeleteReplica(
- master_, ts_desc->permanent_uuid(), table, tablet_id,
+ master_, ts_desc->permanent_uuid(), table.get(), tablet_id,
TABLET_DATA_TOMBSTONED, prev_opid_index,
Substitute("$0 (current committed config index is $1)",
delete_msg, prev_opid_index)));
@@ -4909,7 +4913,7 @@ Status CatalogManager::ProcessTabletReport(
const string& peer_uuid = p.permanent_uuid();
if (!ContainsKey(current_member_uuids, peer_uuid)) {
rpcs.emplace_back(new AsyncDeleteReplica(
- master_, peer_uuid, table, tablet_id,
+ master_, peer_uuid, table.get(), tablet_id,
TABLET_DATA_TOMBSTONED, prev_cstate.committed_config().opid_index(),
Substitute("TS $0 not found in new config with opid_index $1",
peer_uuid, cstate.committed_config().opid_index())));
@@ -5166,7 +5170,7 @@ void CatalogManager::SendDeleteTabletRequest(const scoped_refptr<TabletInfo>& ta
<< " replicas of tablet " << tablet->id();
for (const auto& peer : cstate.committed_config().peers()) {
scoped_refptr<AsyncDeleteReplica> task = new AsyncDeleteReplica(
- master_, peer.permanent_uuid(), tablet->table(), tablet->id(),
+ master_, peer.permanent_uuid(), tablet->table().get(), tablet->id(),
TABLET_DATA_DELETED, none, deletion_msg);
tablet->table()->AddTask(tablet->id(), task);
WARN_NOT_OK(task->Run(), Substitute(
@@ -6513,6 +6517,9 @@ void PersistentTabletInfo::set_state(SysTabletsEntryPB::State state, const strin
TableInfo::TableInfo(string table_id) : table_id_(std::move(table_id)) {}
TableInfo::~TableInfo() {
+ // Abort and wait for all pending tasks completed.
+ AbortTasks();
+ WaitTasksCompletion();
}
string TableInfo::ToString() const {