You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2022/05/16 17:11:03 UTC

[kudu] branch master updated: [test] fix ASAN test failures in master-stress-test

This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new e2f0d8fb0 [test] fix ASAN test failures in master-stress-test
e2f0d8fb0 is described below

commit e2f0d8fb056d9834199ec2feab8888a30daefae7
Author: zhangyifan27 <ch...@163.com>
AuthorDate: Thu May 5 20:30:10 2022 +0800

    [test] fix ASAN test failures in master-stress-test
    
    The MasterStressTest occasionally failed in ASAN builds [1] [2]. I also built
    the test in ASAN mode on a CentOS 7.9 machine, ran it in slow mode 3 times and
    all failed.
    
    It seems that some RetryingTSRpcTask is still running after 'TableInfo' object
    destroyed if we set --enable_metadata_cleanup_for_deleted_tables_and_tablets=true.
    To fix heap-use-after-free errors, we should abort and wait all pending tasks
    completed before we destroy a TabletInfo object. With this fix the ASAN test
    can pass 10 times.
    
    [1] http://jenkins.kudu.apache.org/job/kudu-gerrit/25414/BUILD_TYPE=ASAN/
    [2] http://jenkins.kudu.apache.org/job/kudu-gerrit/25448/BUILD_TYPE=ASAN/
    
    Change-Id: I4eff45fbf05644362169485cd32678509eab1b07
    Reviewed-on: http://gerrit.cloudera.org:8080/18501
    Tested-by: Kudu Jenkins
    Reviewed-by: Alexey Serbin <al...@apache.org>
---
 src/kudu/master/catalog_manager.cc | 49 ++++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/src/kudu/master/catalog_manager.cc b/src/kudu/master/catalog_manager.cc
index 7af3873ba..a1cf87ac8 100644
--- a/src/kudu/master/catalog_manager.cc
+++ b/src/kudu/master/catalog_manager.cc
@@ -3845,14 +3845,17 @@ class PickLeaderReplica : public TSPicker {
 //
 // The target tablet server is refreshed before each RPC by consulting the provided
 // TSPicker implementation.
+// Each created RetryingTSRpcTask should be added to TableInfo::pending_tasks_ by
+// calling TableInfo::AddTask(), so 'table' must remain valid for the lifetime of
+// this class.
 class RetryingTSRpcTask : public MonitoredTask {
  public:
-  RetryingTSRpcTask(Master *master,
+  RetryingTSRpcTask(Master* master,
                     unique_ptr<TSPicker> replica_picker,
-                    scoped_refptr<TableInfo> table)
+                    TableInfo* table)
     : master_(master),
       replica_picker_(std::move(replica_picker)),
-      table_(std::move(table)),
+      table_(table),
       start_ts_(MonoTime::Now()),
       deadline_(start_ts_ + MonoDelta::FromMilliseconds(FLAGS_unresponsive_ts_rpc_timeout_ms)),
       attempt_(0),
@@ -3876,7 +3879,7 @@ class RetryingTSRpcTask : public MonitoredTask {
 
   MonoTime start_timestamp() const override { return start_ts_; }
   MonoTime completion_timestamp() const override { return end_ts_; }
-  const scoped_refptr<TableInfo>& table() const { return table_ ; }
+  TableInfo* table() const { return table_; }
 
  protected:
   // Send an RPC request and register a callback.
@@ -3919,7 +3922,8 @@ class RetryingTSRpcTask : public MonitoredTask {
 
   Master * const master_;
   const unique_ptr<TSPicker> replica_picker_;
-  const scoped_refptr<TableInfo> table_;
+  // RetryingTSRpcTask is owned by 'TableInfo', so the backpointer should be raw.
+  TableInfo* table_;
 
   MonoTime start_ts_;
   MonoTime end_ts_;
@@ -4091,7 +4095,7 @@ class RetrySpecificTSRpcTask : public RetryingTSRpcTask {
  public:
   RetrySpecificTSRpcTask(Master* master,
                          const string& permanent_uuid,
-                         const scoped_refptr<TableInfo>& table)
+                         TableInfo* table)
     : RetryingTSRpcTask(master,
                         unique_ptr<TSPicker>(new PickSpecificUUID(permanent_uuid)),
                         table),
@@ -4113,7 +4117,7 @@ class AsyncCreateReplica : public RetrySpecificTSRpcTask {
                      const string& permanent_uuid,
                      const scoped_refptr<TabletInfo>& tablet,
                      const TabletMetadataLock& tablet_lock)
-    : RetrySpecificTSRpcTask(master, permanent_uuid, tablet->table()),
+    : RetrySpecificTSRpcTask(master, permanent_uuid, tablet->table().get()),
       tablet_id_(tablet->id()) {
     deadline_ = start_ts_ + MonoDelta::FromMilliseconds(FLAGS_tablet_creation_timeout_ms);
 
@@ -4179,17 +4183,17 @@ class AsyncCreateReplica : public RetrySpecificTSRpcTask {
 // Send a DeleteTablet() RPC request.
 class AsyncDeleteReplica : public RetrySpecificTSRpcTask {
  public:
-  AsyncDeleteReplica(
-      Master* master, const string& permanent_uuid,
-      const scoped_refptr<TableInfo>& table, string tablet_id,
-      TabletDataState delete_type,
-      optional<int64_t> cas_config_opid_index_less_or_equal,
-      string reason)
+  AsyncDeleteReplica(Master* master,
+                     const string& permanent_uuid,
+                     TableInfo* table,
+                     string tablet_id,
+                     TabletDataState delete_type,
+                     optional<int64_t> cas_config_opid_index_less_or_equal,
+                     string reason)
       : RetrySpecificTSRpcTask(master, permanent_uuid, table),
         tablet_id_(std::move(tablet_id)),
         delete_type_(delete_type),
-        cas_config_opid_index_less_or_equal_(
-            std::move(cas_config_opid_index_less_or_equal)),
+        cas_config_opid_index_less_or_equal_(std::move(cas_config_opid_index_less_or_equal)),
         reason_(std::move(reason)) {}
 
   string type_name() const override {
@@ -4294,7 +4298,7 @@ class AsyncAlterTable : public RetryingTSRpcTask {
                   scoped_refptr<TabletInfo> tablet)
     : RetryingTSRpcTask(master,
                         unique_ptr<TSPicker>(new PickLeaderReplica(tablet)),
-                        tablet->table()),
+                        tablet->table().get()),
       tablet_(std::move(tablet)) {
   }
 
@@ -4394,7 +4398,7 @@ AsyncChangeConfigTask::AsyncChangeConfigTask(Master* master,
                                              consensus::ChangeConfigType change_config_type)
     : RetryingTSRpcTask(master,
                         unique_ptr<TSPicker>(new PickLeaderReplica(tablet)),
-                        tablet->table()),
+                        tablet->table().get()),
       tablet_(std::move(tablet)),
       cstate_(std::move(cstate)),
       change_config_type_(change_config_type) {
@@ -4759,7 +4763,7 @@ Status CatalogManager::ProcessTabletReport(
       // TODO(unknown): Cancel tablet creation, instead of deleting, in cases
       // where that might be possible (tablet creation timeout & replacement).
       rpcs.emplace_back(new AsyncDeleteReplica(
-          master_, ts_desc->permanent_uuid(), table, tablet_id,
+          master_, ts_desc->permanent_uuid(), table.get(), tablet_id,
           TABLET_DATA_DELETED, none, msg));
       continue;
     }
@@ -4786,7 +4790,7 @@ Status CatalogManager::ProcessTabletReport(
           "Replica has no consensus available" :
           Substitute("Replica with old config index $0", report_opid_index);
       rpcs.emplace_back(new AsyncDeleteReplica(
-          master_, ts_desc->permanent_uuid(), table, tablet_id,
+          master_, ts_desc->permanent_uuid(), table.get(), tablet_id,
           TABLET_DATA_TOMBSTONED, prev_opid_index,
           Substitute("$0 (current committed config index is $1)",
                      delete_msg, prev_opid_index)));
@@ -4909,7 +4913,7 @@ Status CatalogManager::ProcessTabletReport(
             const string& peer_uuid = p.permanent_uuid();
             if (!ContainsKey(current_member_uuids, peer_uuid)) {
               rpcs.emplace_back(new AsyncDeleteReplica(
-                  master_, peer_uuid, table, tablet_id,
+                  master_, peer_uuid, table.get(), tablet_id,
                   TABLET_DATA_TOMBSTONED, prev_cstate.committed_config().opid_index(),
                   Substitute("TS $0 not found in new config with opid_index $1",
                              peer_uuid, cstate.committed_config().opid_index())));
@@ -5166,7 +5170,7 @@ void CatalogManager::SendDeleteTabletRequest(const scoped_refptr<TabletInfo>& ta
       << " replicas of tablet " << tablet->id();
   for (const auto& peer : cstate.committed_config().peers()) {
     scoped_refptr<AsyncDeleteReplica> task = new AsyncDeleteReplica(
-        master_, peer.permanent_uuid(), tablet->table(), tablet->id(),
+        master_, peer.permanent_uuid(), tablet->table().get(), tablet->id(),
         TABLET_DATA_DELETED, none, deletion_msg);
     tablet->table()->AddTask(tablet->id(), task);
     WARN_NOT_OK(task->Run(), Substitute(
@@ -6513,6 +6517,9 @@ void PersistentTabletInfo::set_state(SysTabletsEntryPB::State state, const strin
 TableInfo::TableInfo(string table_id) : table_id_(std::move(table_id)) {}
 
 TableInfo::~TableInfo() {
+  // Abort and wait for all pending tasks completed.
+  AbortTasks();
+  WaitTasksCompletion();
 }
 
 string TableInfo::ToString() const {