You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by mp...@apache.org on 2017/11/27 06:16:28 UTC

[4/6] kudu git commit: make registration of maintenance ops thread-safe

make registration of maintenance ops thread-safe

Before, access to tablet replicas' maintenance ops was not thread-safe.
This was OK because the TSTabletManager externally synchronizes the
initialization and shutdown of replicas, enabling relatively lax locking
for the ops.

In the future, there will be calls that access the maintenance ops
outside of initialization and shutdown, e.g. from some external thread
to handle disk failures by canceling the affected tablets' maintenance
ops. To do so, more stringent locking is needed.

This patch uses TabletReplica's lock_ to synchronize access to the ops.

Change-Id: Ia13051fab85d0f678bd3efdcb69766d66a657cdd
Reviewed-on: http://gerrit.cloudera.org:8080/8635
Reviewed-by: Andrew Wong <aw...@cloudera.com>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/5523b0a3
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/5523b0a3
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/5523b0a3

Branch: refs/heads/master
Commit: 5523b0a307133e5e6827f56d9e6045609936a039
Parents: b390433
Author: Andrew Wong <aw...@cloudera.com>
Authored: Wed Nov 22 16:00:27 2017 -0800
Committer: Andrew Wong <aw...@cloudera.com>
Committed: Thu Nov 23 03:48:21 2017 +0000

----------------------------------------------------------------------
 src/kudu/tablet/tablet_replica.cc | 30 +++++++++++++++++++++---------
 src/kudu/tablet/tablet_replica.h  |  7 +++----
 2 files changed, 24 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/5523b0a3/src/kudu/tablet/tablet_replica.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_replica.cc b/src/kudu/tablet/tablet_replica.cc
index 05c8e79..d44e6b4 100644
--- a/src/kudu/tablet/tablet_replica.cc
+++ b/src/kudu/tablet/tablet_replica.cc
@@ -661,7 +661,7 @@ Status TabletReplica::NewReplicaTransactionDriver(gscoped_ptr<Transaction> trans
 void TabletReplica::RegisterMaintenanceOps(MaintenanceManager* maint_mgr) {
   // Taking state_change_lock_ ensures that we don't shut down concurrently with
   // this last start-up task.
-  std::lock_guard<simple_spinlock> l(state_change_lock_);
+  std::lock_guard<simple_spinlock> state_change_lock(state_change_lock_);
 
   if (state() != RUNNING) {
     LOG(WARNING) << "Not registering maintenance operations for " << tablet_
@@ -669,25 +669,32 @@ void TabletReplica::RegisterMaintenanceOps(MaintenanceManager* maint_mgr) {
     return;
   }
 
-  DCHECK(maintenance_ops_.empty());
+  vector<MaintenanceOp*> maintenance_ops;
 
   gscoped_ptr<MaintenanceOp> mrs_flush_op(new FlushMRSOp(this));
   maint_mgr->RegisterOp(mrs_flush_op.get());
-  maintenance_ops_.push_back(mrs_flush_op.release());
+  maintenance_ops.push_back(mrs_flush_op.release());
 
   gscoped_ptr<MaintenanceOp> dms_flush_op(new FlushDeltaMemStoresOp(this));
   maint_mgr->RegisterOp(dms_flush_op.get());
-  maintenance_ops_.push_back(dms_flush_op.release());
+  maintenance_ops.push_back(dms_flush_op.release());
 
   gscoped_ptr<MaintenanceOp> log_gc(new LogGCOp(this));
   maint_mgr->RegisterOp(log_gc.get());
-  maintenance_ops_.push_back(log_gc.release());
+  maintenance_ops.push_back(log_gc.release());
 
-  tablet_->RegisterMaintenanceOps(maint_mgr);
+  std::shared_ptr<Tablet> tablet;
+  {
+    std::lock_guard<simple_spinlock> l(lock_);
+    DCHECK(maintenance_ops_.empty());
+    maintenance_ops_.swap(maintenance_ops);
+    tablet = tablet_;
+  }
+  tablet->RegisterMaintenanceOps(maint_mgr);
 }
 
 void TabletReplica::CancelMaintenanceOpsForTests() {
-  std::lock_guard<simple_spinlock> l(state_change_lock_);
+  std::lock_guard<simple_spinlock> l(lock_);
   for (MaintenanceOp* op : maintenance_ops_) {
     op->CancelAndDisable();
   }
@@ -695,10 +702,15 @@ void TabletReplica::CancelMaintenanceOpsForTests() {
 
 void TabletReplica::UnregisterMaintenanceOps() {
   DCHECK(state_change_lock_.is_locked());
-  for (MaintenanceOp* op : maintenance_ops_) {
+  vector<MaintenanceOp*> maintenance_ops;
+  {
+    std::lock_guard<simple_spinlock> l(lock_);
+    maintenance_ops.swap(maintenance_ops_);
+  }
+  for (MaintenanceOp* op : maintenance_ops) {
     op->Unregister();
   }
-  STLDeleteElements(&maintenance_ops_);
+  STLDeleteElements(&maintenance_ops);
 }
 
 size_t TabletReplica::OnDiskSize() const {

http://git-wip-us.apache.org/repos/asf/kudu/blob/5523b0a3/src/kudu/tablet/tablet_replica.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_replica.h b/src/kudu/tablet/tablet_replica.h
index 5cc7c9f..27829ba 100644
--- a/src/kudu/tablet/tablet_replica.h
+++ b/src/kudu/tablet/tablet_replica.h
@@ -265,10 +265,9 @@ class TabletReplica : public RefCountedThreadSafe<TabletReplica>,
 
   // Register the maintenance ops associated with this peer's tablet, also invokes
   // Tablet::RegisterMaintenanceOps().
-  void RegisterMaintenanceOps(MaintenanceManager* maintenance_manager);
+  void RegisterMaintenanceOps(MaintenanceManager* maint_mgr);
 
   // Unregister the maintenance ops associated with this replica's tablet.
-  // This method is not thread safe.
   void UnregisterMaintenanceOps();
 
   // Cancels the maintenance ops associated with this replica's tablet.
@@ -337,8 +336,8 @@ class TabletReplica : public RefCountedThreadSafe<TabletReplica>,
   std::shared_ptr<consensus::RaftConsensus> consensus_;
   simple_spinlock prepare_replicate_lock_;
 
-  // Lock protecting state_, last_status_, as well as smart pointers to collaborating
-  // classes such as tablet_ and consensus_.
+  // Lock protecting state_, last_status_, as well as pointers to collaborating
+  // classes such as tablet_, consensus_, and maintenance_ops_.
   mutable simple_spinlock lock_;
 
   // The human-readable last status of the tablet, displayed on the web page, command line