You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by ad...@apache.org on 2016/08/16 18:41:40 UTC

[1/2] kudu git commit: mini_cluster: avoid SIGSEGV when master fails to start up

Repository: kudu
Updated Branches:
  refs/heads/master 60f785160 -> a5a192a48


mini_cluster: avoid SIGSEGV when master fails to start up

My machine's clock became desynchronized, causing MiniCluster-based tests to
SIGSEGV. The root cause appears to be in how we manage the mini_masters_
vector. Because it is resized to its final length right at the beginning, a
master that fails to start leaves its slot holding an empty shared pointer,
so various code paths later on may iterate over some number of non-existent
masters and dereference those empty shared pointers.

Change-Id: Id1cc262ba83f70d97434fca4c75f76b095db77d6
Reviewed-on: http://gerrit.cloudera.org:8080/4006
Tested-by: Kudu Jenkins
Reviewed-by: Todd Lipcon <to...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/f7201a93
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/f7201a93
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/f7201a93

Branch: refs/heads/master
Commit: f7201a93294dd41f03a0a39f6ba38c969d50d6d7
Parents: 60f7851
Author: Adar Dembo <ad...@cloudera.com>
Authored: Tue Aug 16 11:02:42 2016 -0700
Committer: Adar Dembo <ad...@cloudera.com>
Committed: Tue Aug 16 18:41:10 2016 +0000

----------------------------------------------------------------------
 src/kudu/integration-tests/mini_cluster.cc | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/f7201a93/src/kudu/integration-tests/mini_cluster.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/mini_cluster.cc b/src/kudu/integration-tests/mini_cluster.cc
index 78646ee..626ed2c 100644
--- a/src/kudu/integration-tests/mini_cluster.cc
+++ b/src/kudu/integration-tests/mini_cluster.cc
@@ -62,7 +62,6 @@ MiniCluster::MiniCluster(Env* env, const MiniClusterOptions& options)
     num_ts_initial_(options.num_tablet_servers),
     master_rpc_ports_(options.master_rpc_ports),
     tserver_rpc_ports_(options.tserver_rpc_ports) {
-  mini_masters_.resize(num_masters_initial_);
 }
 
 MiniCluster::~MiniCluster() {
@@ -115,7 +114,7 @@ Status MiniCluster::StartDistributedMasters() {
                           Substitute("Couldn't start follower $0", i));
     VLOG(1) << "Started MiniMaster with UUID " << mini_master->permanent_uuid()
             << " at index " << i;
-    mini_masters_[i] = shared_ptr<MiniMaster>(mini_master.release());
+    mini_masters_.push_back(shared_ptr<MiniMaster>(mini_master.release()));
   }
   int i = 0;
   for (const shared_ptr<MiniMaster>& master : mini_masters_) {
@@ -138,8 +137,7 @@ Status MiniCluster::StartSync() {
 }
 
 Status MiniCluster::StartSingleMaster() {
-  // If there's a single master, 'mini_masters_' must be size 1.
-  CHECK_EQ(mini_masters_.size(), 1);
+  CHECK_EQ(1, num_masters_initial_);
   CHECK_LE(master_rpc_ports_.size(), 1);
   uint16_t master_rpc_port = 0;
   if (master_rpc_ports_.size() == 1) {
@@ -152,7 +150,7 @@ Status MiniCluster::StartSingleMaster() {
   RETURN_NOT_OK_PREPEND(mini_master->Start(), "Couldn't start master");
   RETURN_NOT_OK(mini_master->master()->
       WaitUntilCatalogManagerIsLeaderAndReadyForTests(MonoDelta::FromSeconds(5)));
-  mini_masters_[0] = shared_ptr<MiniMaster>(mini_master.release());
+  mini_masters_.push_back(shared_ptr<MiniMaster>(mini_master.release()));
   return Status::OK();
 }
 
@@ -184,10 +182,7 @@ void MiniCluster::Shutdown() {
     tablet_server->Shutdown();
   }
   mini_tablet_servers_.clear();
-  for (shared_ptr<MiniMaster>& master_server : mini_masters_) {
-    master_server->Shutdown();
-    master_server.reset();
-  }
+  ShutdownMasters();
   running_ = false;
 }
 
@@ -196,6 +191,7 @@ void MiniCluster::ShutdownMasters() {
     master_server->Shutdown();
     master_server.reset();
   }
+  mini_masters_.clear();
 }
 
 MiniMaster* MiniCluster::mini_master(int idx) {


[2/2] kudu git commit: catalog_manager: avoid race in InitSysCatalogAsync() and GetTabletPeer()

Posted by ad...@apache.org.
catalog_manager: avoid race in InitSysCatalogAsync() and GetTabletPeer()

Commit 2525ad0 took a stab at this, but it doesn't handle the case where
InitSysCatalogAsync() fails and leaves behind sys_catalog_ without a
functional tablet peer, as in the new integration test
MasterReplicationTest.TestMasterPeerSetsDontMatch. So here's another
attempt, where sys_catalog_ is only set when it is fully formed (i.e. when
it has a functional TabletPeer).

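It turns out this isn't enough; we also need to prevent ElectedAsLeaderCb
from making progress until InitSysCatalogAsync() sets sys_catalog_. The
extra lock acquisition in VisitTablesAndTabletsTask() is hacky in that it
doesn't explicitly protect anything: it only waits out an in-progress
InitSysCatalogAsync(), which holds lock_ for its entire duration. But it
gets the job done.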

Below I've included test output when the race hits.

master_replication-itest: /home/jenkins-slave/workspace/kudu-3/src/kudu/gutil/ref_counted.h:273: T *scoped_refptr<kudu::tablet::TabletPeer>::operator->() const [T = kudu::tablet::TabletPeer]: Assertion `ptr_ != __null' failed.
*** Aborted at 1471309445 (unix time) try "date -d @1471309445" if you are using GNU date ***
PC: @     0x7f330225dcc9 gsignal
*** SIGABRT (@0x3e800006e90) received by PID 28304 (TID 0x7f32f06eb700) from PID 28304; stack trace: ***
    @           0x42e687 __tsan::CallUserSignalHandler() at /home/jenkins-slave/workspace/kudu-3/thirdparty/llvm-3.8.0.src/projects/compiler-rt/lib/tsan/rtl/tsan_interceptors.cc:1962
    @           0x42f4d3 rtl_sigaction() at /home/jenkins-slave/workspace/kudu-3/thirdparty/llvm-3.8.0.src/projects/compiler-rt/lib/tsan/rtl/tsan_interceptors.cc:2039
    @     0x7f33090a4340 (unknown) at ??:0
    @     0x7f330225dcc9 gsignal at ??:0
    @     0x7f33022610d8 abort at ??:0
    @     0x7f3302256b86 (unknown) at ??:0
    @     0x7f3302256c32 __assert_fail at ??:0
    @     0x7f330ca13130 scoped_refptr<>::operator->() at ??:0
    @     0x7f330ca1a952 kudu::master::SysCatalogTable::tablet_id() at ??:0
    @     0x7f330ca0b136 kudu::master::CatalogManager::GetTabletPeer() at ??:0
    @     0x7f330c69214d kudu::tserver::(anonymous namespace)::LookupTabletPeerOrRespond<>() at ??:0
    @     0x7f330c691bab kudu::tserver::ConsensusServiceImpl::RequestConsensusVote() at ??:0
    @     0x7f3307c9fca5 kudu::consensus::ConsensusServiceIf::ConsensusServiceIf()::$_1::operator()() at ??:0
    @     0x7f3307c9fabf std::_Function_handler<>::_M_invoke() at ??:0
    @     0x7f3306bd7219 std::function<>::operator()() at ??:0
    @     0x7f3306bd6c8e kudu::rpc::GeneratedServiceIf::Handle() at ??:0
    @     0x7f3306bd8b3e kudu::rpc::ServicePool::RunThread() at ??:0
    @     0x7f3306bdaa27 boost::_mfi::mf0<>::operator()() at ??:0
    @     0x7f3306bda98b boost::_bi::list1<>::operator()<>() at ??:0
    @     0x7f3306bda934 boost::_bi::bind_t<>::operator()() at ??:0
    @     0x7f3306bda75a boost::detail::function::void_function_obj_invoker0<>::invoke() at ??:0
    @     0x7f3306b758b2 boost::function0<>::operator()() at ??:0
    @     0x7f3304962630 kudu::Thread::SuperviseThread() at ??:0

Change-Id: I43fdc6499cb84d2053bed08b689fe5a08a6761d6
Reviewed-on: http://gerrit.cloudera.org:8080/3997
Tested-by: Kudu Jenkins
Reviewed-by: Todd Lipcon <to...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/a5a192a4
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/a5a192a4
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/a5a192a4

Branch: refs/heads/master
Commit: a5a192a48f12cc8ae87ad3c7568d41bf6e657d0b
Parents: f7201a9
Author: Adar Dembo <ad...@cloudera.com>
Authored: Mon Aug 15 18:27:48 2016 -0700
Committer: Adar Dembo <ad...@cloudera.com>
Committed: Tue Aug 16 18:41:11 2016 +0000

----------------------------------------------------------------------
 src/kudu/master/catalog_manager.cc | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/a5a192a4/src/kudu/master/catalog_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/master/catalog_manager.cc b/src/kudu/master/catalog_manager.cc
index 6d756fe..b11218e 100644
--- a/src/kudu/master/catalog_manager.cc
+++ b/src/kudu/master/catalog_manager.cc
@@ -618,6 +618,10 @@ Status CatalogManager::WaitUntilCaughtUpAsLeader(const MonoDelta& timeout) {
 }
 
 void CatalogManager::VisitTablesAndTabletsTask() {
+  {
+    // Hack to block this function until InitSysCatalogAsync() is finished.
+    shared_lock<LockType> l(lock_);
+  }
   Consensus* consensus = sys_catalog_->tablet_peer()->consensus();
   int64_t term = consensus->ConsensusState(CONSENSUS_CONFIG_COMMITTED).current_term();
   {
@@ -693,15 +697,17 @@ Status CatalogManager::VisitTablesAndTablets() {
 
 Status CatalogManager::InitSysCatalogAsync(bool is_first_run) {
   std::lock_guard<LockType> l(lock_);
-  sys_catalog_.reset(new SysCatalogTable(master_,
-                                         master_->metric_registry(),
-                                         Bind(&CatalogManager::ElectedAsLeaderCb,
-                                              Unretained(this))));
+  unique_ptr<SysCatalogTable> new_catalog(
+      new SysCatalogTable(master_,
+                          master_->metric_registry(),
+                          Bind(&CatalogManager::ElectedAsLeaderCb,
+                               Unretained(this))));
   if (is_first_run) {
-    RETURN_NOT_OK(sys_catalog_->CreateNew(master_->fs_manager()));
+    RETURN_NOT_OK(new_catalog->CreateNew(master_->fs_manager()));
   } else {
-    RETURN_NOT_OK(sys_catalog_->Load(master_->fs_manager()));
+    RETURN_NOT_OK(new_catalog->Load(master_->fs_manager()));
   }
+  sys_catalog_.reset(new_catalog.release());
   return Status::OK();
 }