You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by ad...@apache.org on 2019/11/01 00:25:03 UTC

[kudu] branch master updated: KUDU-2949: deflake raft_consensus_quorum-test

This is an automated email from the ASF dual-hosted git repository.

adar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 87d90b2  KUDU-2949: deflake raft_consensus_quorum-test
87d90b2 is described below

commit 87d90b2e5a7b593164d3a13ae40bf1948c3ad914
Author: Adar Dembo <ad...@cloudera.com>
AuthorDate: Tue Oct 29 16:43:23 2019 -0700

    KUDU-2949: deflake raft_consensus_quorum-test
    
    One of my precommits failed with ThreadCollisionWarner triggering a crash in
    raft_consensus_quorum-test. Digging into it, the cause appears to be the
    "synthetic" nature that the test checks certain cmeta properties.
    
    Back in commit 17f97531e the real cmeta mutex was replaced with a
    DFAKE_MUTEX. This fake lock works by using the ThreadCollisionWarner to
    enforce that access to the "protected" object is externally synchronized;
    that is, another lock is always held while calling the object's methods. In
    cmeta's case, that external lock is the main RaftConsensus lock.
    
    The problem is that raft_consensus_quorum-test accesses cmeta directly,
    without going through RaftConsensus. This opens the door to crashes like
    these since there's no external synchronization in these accesses.
    
    I opted to fix this by changing the test's ReadConsensusMetadataFromDisk
    method to actually go to disk for its cmeta. I could have poked a few test
    holes into RaftConsensus, but it seemed odd for a method that purported to
    go "to disk" to actually get cmeta from the ConsensusMetadataManager, which
    caches it in a map and always returns the same instance.
    
    I also included some test cleanup.
    
      I1029 18:08:21.149920   222 raft_consensus.cc:2316] T TestTablet P 1894ac4aabfc4698b93d6c621f3c9ceb [term 1 FOLLOWER]: Leader election vote request: Denying vote to candidate c0fad19d10b74845b028ecce5cde3fbe for term 2 because replica is either leader or believes a valid leader to be alive.
      I1029 18:08:21.150341   222 raft_consensus.cc:2901] T TestTablet P 1894ac4aabfc4698b93d6c621f3c9ceb [term 1 FOLLOWER]: Advancing to term 2
      W1029 18:08:21.701403   222 consensus_meta.cc:220] T TestTablet P 1894ac4aabfc4698b93d6c621f3c9ceb: Time spent flushing consensus metadata: real 0.551s	user 0.001s	sys 0.000s
      I1029 18:08:21.701627   222 raft_consensus.cc:2355] T TestTablet P 1894ac4aabfc4698b93d6c621f3c9ceb [term 2 FOLLOWER]: Leader election vote request: Granting yes vote for candidate c0fad19d10b74845b028ecce5cde3fbe in term 2.
      F1029 18:08:21.701889   586 thread_collision_warner.cc:23] Thread Collision! Previous thread id: 222, current thread id: 586
      *** Check failure stack trace: ***
      *** Aborted at 1572372501 (unix time) try "date -d @1572372501" if you are using GNU date ***
      PC: @     0x7ff8b8713c37 gsignal
      *** SIGABRT (@0x3e8000000de) received by PID 222 (TID 0x7ff8a2b0e700) from PID 222; stack trace: ***
        @     0x7ff8b92d6330 (unknown) at ??:0
        @     0x7ff8b8713c37 gsignal at ??:0
        @     0x7ff8b8717028 abort at ??:0
        @     0x7ff8bb112e09 google::logging_fail() at ??:0
        @     0x7ff8bb11462d google::LogMessage::Fail() at ??:0
        @     0x7ff8bb11664c google::LogMessage::SendToLog() at ??:0
        @     0x7ff8bb114189 google::LogMessage::Flush() at ??:0
        @     0x7ff8bb116fdf google::LogMessageFatal::~LogMessageFatal() at ??:0
        @     0x7ff8bb4ef2d3 base::DCheckAsserter::warn() at ??:0
        @     0x7ff8bb4ef3e8 base::ThreadCollisionWarner::EnterSelf() at ??:0
        @     0x7ff8c6ecb86b kudu::consensus::ConsensusMetadata::current_term() at ??:0
        @     0x7ff8c6fa1632 kudu::consensus::RaftConsensus::CurrentTermUnlocked() at ??:0
        @     0x7ff8c6fc847e kudu::consensus::RaftConsensus::HandleLeaderRequestTermUnlocked() at ??:0
        @     0x7ff8c6fca887 kudu::consensus::RaftConsensus::CheckLeaderRequestUnlocked() at ??:0
        @     0x7ff8c6fbf7c7 kudu::consensus::RaftConsensus::UpdateReplica() at ??:0
        @     0x7ff8c6fbe5e2 kudu::consensus::RaftConsensus::Update() at ??:0
        @           0x6112b2 kudu::consensus::LocalTestPeerProxy::SendUpdateRequest() at /home/jenkins-slave/workspace/kudu-master/2/src/kudu/consensus/consensus-test-util.h:?
    
    Change-Id: Icacdef44fd545f46eb9f47c577641e3aa66d6e94
    Reviewed-on: http://gerrit.cloudera.org:8080/14580
    Tested-by: Adar Dembo <ad...@cloudera.com>
    Reviewed-by: Alexey Serbin <as...@cloudera.com>
---
 src/kudu/consensus/consensus_meta.h              |  1 +
 src/kudu/consensus/raft_consensus.cc             |  2 +-
 src/kudu/consensus/raft_consensus.h              |  4 +-
 src/kudu/consensus/raft_consensus_quorum-test.cc | 59 +++++++++++-------------
 src/kudu/tablet/tablet_replica.cc                |  2 +-
 5 files changed, 32 insertions(+), 36 deletions(-)

diff --git a/src/kudu/consensus/consensus_meta.h b/src/kudu/consensus/consensus_meta.h
index ee4f591..df24fd2 100644
--- a/src/kudu/consensus/consensus_meta.h
+++ b/src/kudu/consensus/consensus_meta.h
@@ -162,6 +162,7 @@ class ConsensusMetadata : public RefCountedThreadSafe<ConsensusMetadata> {
   }
 
  private:
+  friend class RaftConsensusQuorumTest;
   friend class RefCountedThreadSafe<ConsensusMetadata>;
   friend class ConsensusMetadataManager;
 
diff --git a/src/kudu/consensus/raft_consensus.cc b/src/kudu/consensus/raft_consensus.cc
index 84f336e..4300ebf 100644
--- a/src/kudu/consensus/raft_consensus.cc
+++ b/src/kudu/consensus/raft_consensus.cc
@@ -220,7 +220,7 @@ Status RaftConsensus::Create(ConsensusOptions options,
 }
 
 Status RaftConsensus::Start(const ConsensusBootstrapInfo& info,
-                            gscoped_ptr<PeerProxyFactory> peer_proxy_factory,
+                            unique_ptr<PeerProxyFactory> peer_proxy_factory,
                             scoped_refptr<log::Log> log,
                             scoped_refptr<TimeManager> time_manager,
                             ConsensusRoundHandler* round_handler,
diff --git a/src/kudu/consensus/raft_consensus.h b/src/kudu/consensus/raft_consensus.h
index b2afafb..a631c5a 100644
--- a/src/kudu/consensus/raft_consensus.h
+++ b/src/kudu/consensus/raft_consensus.h
@@ -145,7 +145,7 @@ class RaftConsensus : public std::enable_shared_from_this<RaftConsensus>,
   // Start() is not thread-safe. Calls to Start() should be externally
   // synchronized with calls accessing non-const members of this class.
   Status Start(const ConsensusBootstrapInfo& info,
-               gscoped_ptr<PeerProxyFactory> peer_proxy_factory,
+               std::unique_ptr<PeerProxyFactory> peer_proxy_factory,
                scoped_refptr<log::Log> log,
                scoped_refptr<TimeManager> time_manager,
                ConsensusRoundHandler* round_handler,
@@ -834,7 +834,7 @@ class RaftConsensus : public std::enable_shared_from_this<RaftConsensus>,
 
   scoped_refptr<log::Log> log_;
   scoped_refptr<TimeManager> time_manager_;
-  gscoped_ptr<PeerProxyFactory> peer_proxy_factory_;
+  std::unique_ptr<PeerProxyFactory> peer_proxy_factory_;
 
   // When we receive a message from a remote peer telling us to start a
   // transaction, or finish a round, we use this handler to handle it.
diff --git a/src/kudu/consensus/raft_consensus_quorum-test.cc b/src/kudu/consensus/raft_consensus_quorum-test.cc
index 1b66e43..abcdf5c 100644
--- a/src/kudu/consensus/raft_consensus_quorum-test.cc
+++ b/src/kudu/consensus/raft_consensus_quorum-test.cc
@@ -60,7 +60,6 @@
 #include "kudu/gutil/gscoped_ptr.h"
 #include "kudu/gutil/map-util.h"
 #include "kudu/gutil/ref_counted.h"
-#include "kudu/gutil/stl_util.h"
 #include "kudu/gutil/strings/strcat.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/tablet/metadata.pb.h"
@@ -147,14 +146,10 @@ class RaftConsensusQuorumTest : public KuduTest {
       opts.parent_mem_tracker = parent_mem_tracker;
       opts.wal_root = test_path;
       opts.data_roots = { test_path };
-      gscoped_ptr<FsManager> fs_manager(new FsManager(env_, opts));
+      unique_ptr<FsManager> fs_manager(new FsManager(env_, opts));
       RETURN_NOT_OK(fs_manager->CreateInitialFileSystemLayout());
       RETURN_NOT_OK(fs_manager->Open());
 
-      scoped_refptr<ConsensusMetadataManager> cmeta_manager(
-          new ConsensusMetadataManager(fs_manager.get()));
-      cmeta_managers_.push_back(cmeta_manager);
-
       scoped_refptr<Log> log;
       RETURN_NOT_OK(Log::Open(LogOptions(),
                               fs_manager.get(),
@@ -164,7 +159,7 @@ class RaftConsensusQuorumTest : public KuduTest {
                               nullptr,
                               &log));
       logs_.emplace_back(std::move(log));
-      fs_managers_.push_back(fs_manager.release());
+      fs_managers_.push_back(std::move(fs_manager));
     }
     return Status::OK();
   }
@@ -184,18 +179,21 @@ class RaftConsensusQuorumTest : public KuduTest {
   }
 
   Status BuildPeers() {
-    CHECK_EQ(config_.peers_size(), cmeta_managers_.size());
     CHECK_EQ(config_.peers_size(), fs_managers_.size());
     for (int i = 0; i < config_.peers_size(); i++) {
-      RETURN_NOT_OK(cmeta_managers_[i]->Create(kTestTablet, config_, kMinimumTerm));
+      FsManager* fs = fs_managers_[i].get();
+      scoped_refptr<ConsensusMetadataManager> cmeta_manager(
+          new ConsensusMetadataManager(fs));
+
+      RETURN_NOT_OK(cmeta_manager->Create(kTestTablet, config_, kMinimumTerm));
 
       RaftPeerPB* local_peer_pb;
-      RETURN_NOT_OK(GetRaftConfigMember(&config_, fs_managers_[i]->uuid(), &local_peer_pb));
+      RETURN_NOT_OK(GetRaftConfigMember(&config_, fs->uuid(), &local_peer_pb));
 
       shared_ptr<RaftConsensus> peer;
       RETURN_NOT_OK(RaftConsensus::Create(options_,
                                           config_.peers(i),
-                                          cmeta_managers_[i],
+                                          std::move(cmeta_manager),
                                           raft_pool_.get(),
                                           &peer));
       peers_->AddPeer(config_.peers(i).permanent_uuid(), peer);
@@ -211,18 +209,21 @@ class RaftConsensusQuorumTest : public KuduTest {
       shared_ptr<RaftConsensus> peer;
       RETURN_NOT_OK(peers_->GetPeerByIdx(i, &peer));
 
-      gscoped_ptr<PeerProxyFactory> proxy_factory(new LocalTestPeerProxyFactory(peers_.get()));
-      scoped_refptr<TimeManager> time_manager(new TimeManager(clock_, Timestamp::kMin));
-      auto txn_factory = new TestTransactionFactory(logs_[i].get());
+      unique_ptr<PeerProxyFactory> proxy_factory(
+          new LocalTestPeerProxyFactory(peers_.get()));
+      scoped_refptr<TimeManager> time_manager(
+          new TimeManager(clock_, Timestamp::kMin));
+      unique_ptr<TestTransactionFactory> txn_factory(
+          new TestTransactionFactory(logs_[i].get()));
       txn_factory->SetConsensus(peer.get());
-      txn_factories_.push_back(txn_factory);
+      txn_factories_.emplace_back(std::move(txn_factory));
 
       RETURN_NOT_OK(peer->Start(
           boot_info,
           std::move(proxy_factory),
           logs_[i],
           time_manager,
-          txn_factory,
+          txn_factories_.back().get(),
           metric_entity_,
           Bind(&DoNothing)));
     }
@@ -274,9 +275,9 @@ class RaftConsensusQuorumTest : public KuduTest {
     CHECK_OK(peers_->GetPeerByIdx(peer_idx, &peer));
 
     // Use a latch in place of a Transaction callback.
-    gscoped_ptr<Synchronizer> sync(new Synchronizer());
+    unique_ptr<Synchronizer> sync(new Synchronizer());
     *round = peer->NewRound(std::move(msg), sync->AsStdStatusCallback());
-    InsertOrDie(&syncs_, round->get(), sync.release());
+    EmplaceOrDie(&syncs_, round->get(), std::move(sync));
     RETURN_NOT_OK_PREPEND(peer->Replicate(round->get()),
                           Substitute("Unable to replicate to peer $0", peer_idx));
     return Status::OK();
@@ -429,7 +430,7 @@ class RaftConsensusQuorumTest : public KuduTest {
     ASSERT_OK(log->WaitUntilAllFlushed());
     log->Close();
     shared_ptr<LogReader> log_reader;
-    ASSERT_OK(log::LogReader::Open(fs_managers_[idx],
+    ASSERT_OK(log::LogReader::Open(fs_managers_[idx].get(),
                                    scoped_refptr<log::LogIndex>(),
                                    kTestTablet,
                                    metric_entity_.get(),
@@ -450,7 +451,7 @@ class RaftConsensusQuorumTest : public KuduTest {
   void VerifyLogs(int leader_idx, int first_replica_idx, int last_replica_idx) {
     // Wait for in-flight transactions to be done. We're destroying the
     // peers next and leader transactions won't be able to commit anymore.
-    for (TestTransactionFactory* factory : txn_factories_) {
+    for (const auto& factory : txn_factories_) {
       factory->WaitDone();
     }
 
@@ -543,8 +544,9 @@ class RaftConsensusQuorumTest : public KuduTest {
 
   // Read the ConsensusMetadata for the given peer from disk.
   scoped_refptr<ConsensusMetadata> ReadConsensusMetadataFromDisk(int peer_index) {
+    FsManager* fs = fs_managers_[peer_index].get();
     scoped_refptr<ConsensusMetadata> cmeta;
-    CHECK_OK(cmeta_managers_[peer_index]->Load(kTestTablet, &cmeta));
+    CHECK_OK(ConsensusMetadata::Load(fs, kTestTablet, fs->uuid(), &cmeta));
     return cmeta;
   }
 
@@ -564,12 +566,6 @@ class RaftConsensusQuorumTest : public KuduTest {
 
   ~RaftConsensusQuorumTest() {
     peers_->Clear();
-    STLDeleteElements(&txn_factories_);
-    // We need to clear the logs before deleting the fs_managers_ or we'll
-    // get a SIGSEGV when closing the logs.
-    logs_.clear();
-    STLDeleteElements(&fs_managers_);
-    STLDeleteValues(&syncs_);
   }
 
  protected:
@@ -577,17 +573,16 @@ class RaftConsensusQuorumTest : public KuduTest {
   RaftConfigPB config_;
   OpId initial_id_;
   vector<shared_ptr<MemTracker>> parent_mem_trackers_;
-  vector<FsManager*> fs_managers_;
+  vector<unique_ptr<FsManager>> fs_managers_;
   vector<scoped_refptr<Log> > logs_;
   unique_ptr<ThreadPool> raft_pool_;
-  vector<scoped_refptr<ConsensusMetadataManager>> cmeta_managers_;
-  gscoped_ptr<TestPeerMapManager> peers_;
-  vector<TestTransactionFactory*> txn_factories_;
+  unique_ptr<TestPeerMapManager> peers_;
+  vector<unique_ptr<TestTransactionFactory>> txn_factories_;
   scoped_refptr<clock::Clock> clock_;
   MetricRegistry metric_registry_;
   scoped_refptr<MetricEntity> metric_entity_;
   const Schema schema_;
-  std::unordered_map<ConsensusRound*, Synchronizer*> syncs_;
+  std::unordered_map<ConsensusRound*, unique_ptr<Synchronizer>> syncs_;
 };
 
 // Tests Replicate/Commit a single message through the leader.
diff --git a/src/kudu/tablet/tablet_replica.cc b/src/kudu/tablet/tablet_replica.cc
index 163585b..3999bdb 100644
--- a/src/kudu/tablet/tablet_replica.cc
+++ b/src/kudu/tablet/tablet_replica.cc
@@ -176,7 +176,7 @@ Status TabletReplica::Start(const ConsensusBootstrapInfo& bootstrap_info,
     std::lock_guard<simple_spinlock> state_change_guard(state_change_lock_);
 
     scoped_refptr<MetricEntity> metric_entity;
-    gscoped_ptr<PeerProxyFactory> peer_proxy_factory;
+    unique_ptr<PeerProxyFactory> peer_proxy_factory;
     scoped_refptr<TimeManager> time_manager;
     {
       std::lock_guard<simple_spinlock> l(lock_);