You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by to...@apache.org on 2016/03/01 01:56:07 UTC

incubator-kudu git commit: KUDU-1354. Writes should not release locks before committing MVCC

Repository: incubator-kudu
Updated Branches:
  refs/heads/master 9d2b7a8bb -> 470aeb069


KUDU-1354. Writes should not release locks before committing MVCC

Previous to this commit, write operations released their row locks before
they marked their MVCC transaction as "committed". This could result in
the following interleaving when two writes operated on the same row:

T1:                     T2:
- acquire row lock
- acquire timestamp 1
- apply write
- release row lock
                        - acquire row lock
                        - acquire timestamp 2
                        - apply write
                        - release row lock
                        - commit MVCC txn 2
- commit MVCC txn 1

If a reader acquired an MVCC snapshot in after T2 committed, but
before T1, then its scanner could see invalid operations, such
as a delete or update for a row that was not yet inserted.

In the case of a flush taking its snapshot during this window of time,
it was possible for the "phase 1" of the flush to include T2, while
"phase 2" included T1. This would result in T2 being written to an
earlier-indexed REDO delta file compared to T1, which would cause a
CHECK failure when that rowset was later read or compacted.

This reduced alter_table-randomized-test from failing about 1% of the time
to less than 0.1% [1]. The remaining crashes are due to trying to open empty
LogBlockManager metadata files (KUDU-668). (test results are reported on top
of WIP fixes for KUDU-969 and KUDU-1341, but rebased this patch to come
before those)

[1] http://dist-test.cloudera.org//job?job_id=todd.1456690849.21084

Change-Id: Ie715e94f7b7e4848213d41c6837af574f5fa5d48
Reviewed-on: http://gerrit.cloudera.org:8080/2342
Tested-by: Todd Lipcon <to...@apache.org>
Reviewed-by: David Ribeiro Alves <da...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kudu/commit/470aeb06
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kudu/tree/470aeb06
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kudu/diff/470aeb06

Branch: refs/heads/master
Commit: 470aeb06905f7ff8d6647609e0c44c2bd1dd60c8
Parents: 9d2b7a8
Author: Todd Lipcon <to...@apache.org>
Authored: Sun Feb 28 11:58:55 2016 -0800
Committer: Todd Lipcon <to...@apache.org>
Committed: Tue Mar 1 00:55:47 2016 +0000

----------------------------------------------------------------------
 src/kudu/tablet/local_tablet_writer.h           |  4 +-
 src/kudu/tablet/transactions/transaction.h      |  6 ---
 .../tablet/transactions/transaction_driver.cc   | 10 ++--
 .../tablet/transactions/write_transaction.cc    | 56 ++++++++------------
 .../tablet/transactions/write_transaction.h     | 20 ++++---
 5 files changed, 38 insertions(+), 58 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/470aeb06/src/kudu/tablet/local_tablet_writer.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/local_tablet_writer.h b/src/kudu/tablet/local_tablet_writer.h
index 3a6a645..0e94272 100644
--- a/src/kudu/tablet/local_tablet_writer.h
+++ b/src/kudu/tablet/local_tablet_writer.h
@@ -99,9 +99,7 @@ class LocalTabletWriter {
     tablet_->ApplyRowOperations(tx_state_.get());
 
     tx_state_->ReleaseTxResultPB(&result_);
-    tx_state_->Commit();
-    tx_state_->release_row_locks();
-    tx_state_->ReleaseSchemaLock();
+    tx_state_->CommitOrAbort(Transaction::COMMITTED);
 
     // Return the status of first failed op.
     int op_idx = 0;

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/470aeb06/src/kudu/tablet/transactions/transaction.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/transactions/transaction.h b/src/kudu/tablet/transactions/transaction.h
index 9d2aa25..5774efb 100644
--- a/src/kudu/tablet/transactions/transaction.h
+++ b/src/kudu/tablet/transactions/transaction.h
@@ -102,12 +102,6 @@ class Transaction {
   // method where data-structures are changed.
   virtual Status Apply(gscoped_ptr<consensus::CommitMsg>* commit_msg) = 0;
 
-  // Executed after Apply() but before the commit is submitted to consensus.
-  // Some transactions use this to perform pre-commit actions (e.g. write
-  // transactions perform early lock release on this hook).
-  // Default implementation does nothing.
-  virtual void PreCommit() {}
-
   // Executed after the transaction has been applied and the commit message has
   // been appended to the log (though it might not be durable yet), or if the
   // transaction was aborted.

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/470aeb06/src/kudu/tablet/transactions/transaction_driver.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/transactions/transaction_driver.cc b/src/kudu/tablet/transactions/transaction_driver.cc
index 71558a5..3aa191a 100644
--- a/src/kudu/tablet/transactions/transaction_driver.cc
+++ b/src/kudu/tablet/transactions/transaction_driver.cc
@@ -367,6 +367,11 @@ void TransactionDriver::ApplyTask() {
     commit_msg->mutable_commited_op_id()->CopyFrom(op_id_copy_);
     SetResponseTimestamp(transaction_->state(), transaction_->state()->timestamp());
 
+    {
+      TRACE_EVENT1("txn", "AsyncAppendCommit", "txn", this);
+      CHECK_OK(log_->AsyncAppendCommit(std::move(commit_msg), Bind(DoNothingStatusCB)));
+    }
+
     // If the client requested COMMIT_WAIT as the external consistency mode
     // calculate the latest that the prepare timestamp could be and wait
     // until now.earliest > prepare_latest. Only after this are the locks
@@ -381,11 +386,6 @@ void TransactionDriver::ApplyTask() {
       CHECK_OK(CommitWait());
     }
 
-    transaction_->PreCommit();
-    {
-      TRACE_EVENT1("txn", "AsyncAppendCommit", "txn", this);
-      CHECK_OK(log_->AsyncAppendCommit(std::move(commit_msg), Bind(DoNothingStatusCB)));
-    }
     Finalize();
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/470aeb06/src/kudu/tablet/transactions/write_transaction.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/transactions/write_transaction.cc b/src/kudu/tablet/transactions/write_transaction.cc
index 39f960b..844fb32 100644
--- a/src/kudu/tablet/transactions/write_transaction.cc
+++ b/src/kudu/tablet/transactions/write_transaction.cc
@@ -146,27 +146,19 @@ Status WriteTransaction::Apply(gscoped_ptr<CommitMsg>* commit_msg) {
   return Status::OK();
 }
 
-void WriteTransaction::PreCommit() {
-  TRACE_EVENT0("txn", "WriteTransaction::PreCommit");
-  TRACE("PRECOMMIT: Releasing row and schema locks");
-  // Perform early lock release after we've applied all changes
-  state()->release_row_locks();
-  state()->ReleaseSchemaLock();
-}
-
 void WriteTransaction::Finish(TransactionResult result) {
   TRACE_EVENT0("txn", "WriteTransaction::Finish");
+
+  state()->CommitOrAbort(result);
+
   if (PREDICT_FALSE(result == Transaction::ABORTED)) {
-    TRACE("FINISH: aborting transaction");
-    state()->Abort();
+    TRACE("FINISH: transaction aborted");
     return;
   }
 
   DCHECK_EQ(result, Transaction::COMMITTED);
-  // Now that all of the changes have been applied and the commit is durable
-  // make the changes visible to readers.
-  TRACE("FINISH: making edits visible");
-  state()->Commit();
+
+  TRACE("FINISH: updating metrics");
 
   TabletMetrics* metrics = state_->tablet_peer()->tablet()->metrics();
   if (metrics) {
@@ -256,29 +248,27 @@ void WriteTransactionState::StartApplying() {
   CHECK_NOTNULL(mvcc_tx_.get())->StartApplying();
 }
 
-void WriteTransactionState::Abort() {
+void WriteTransactionState::CommitOrAbort(Transaction::TransactionResult result) {
   if (mvcc_tx_.get() != nullptr) {
-    // Abort the transaction.
-    mvcc_tx_->Abort();
+    // Commit the transaction.
+    switch (result) {
+      case Transaction::COMMITTED:
+        mvcc_tx_->Commit();
+        break;
+      case Transaction::ABORTED:
+        mvcc_tx_->Abort();
+        break;
+    }
   }
   mvcc_tx_.reset();
 
-  release_row_locks();
+  TRACE("Releasing row and schema locks");
+  ReleaseRowLocks();
   ReleaseSchemaLock();
 
-  // After commiting, we may respond to the RPC and delete the
-  // original request, so null them out here.
-  ResetRpcFields();
-}
-void WriteTransactionState::Commit() {
-  if (mvcc_tx_.get() != nullptr) {
-    // Commit the transaction.
-    mvcc_tx_->Commit();
-  }
-  mvcc_tx_.reset();
-
-  // After commiting, we may respond to the RPC and delete the
-  // original request, so null them out here.
+  // After committing, if there is an RPC going on, the driver will respond to it.
+  // That will delete the RPC request and response objects. So, NULL them here
+  // so we don't read them again after they're deleted.
   ResetRpcFields();
 }
 
@@ -310,7 +300,7 @@ void WriteTransactionState::UpdateMetricsForOp(const RowOp& op) {
   }
 }
 
-void WriteTransactionState::release_row_locks() {
+void WriteTransactionState::ReleaseRowLocks() {
   // free the row locks
   for (RowOp* op : row_ops_) {
     op->row_lock.Release();
@@ -323,7 +313,7 @@ WriteTransactionState::~WriteTransactionState() {
 
 void WriteTransactionState::Reset() {
   // We likely shouldn't Commit() here. See KUDU-625.
-  Commit();
+  CommitOrAbort(Transaction::COMMITTED);
   tx_metrics_.Reset();
   timestamp_ = Timestamp::kInvalidTimestamp;
   tablet_components_ = nullptr;

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/470aeb06/src/kudu/tablet/transactions/write_transaction.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/transactions/write_transaction.h b/src/kudu/tablet/transactions/write_transaction.h
index 6ec52e2..aa3d05d 100644
--- a/src/kudu/tablet/transactions/write_transaction.h
+++ b/src/kudu/tablet/transactions/write_transaction.h
@@ -139,15 +139,16 @@ class WriteTransactionState : public TransactionState {
   // blocked on it).
   void StartApplying();
 
-  // Commits the Mvcc transaction and releases the component lock. After
-  // this method is called all the inserts and mutations will become
-  // visible to other transactions.
+  // Commits or aborts the MVCC transaction and releases all locks held.
   //
-  // Only one of Commit() or Abort() should be called.
+  // In the case of COMMITTED, this method makes the inserts and mutations performed
+  // by this transaction visible to other transactions.
+  //
+  // Must be called exactly once.
   // REQUIRES: StartApplying() was called.
   //
   // Note: request_ and response_ are set to NULL after this method returns.
-  void Commit();
+  void CommitOrAbort(Transaction::TransactionResult result);
 
   // Aborts the mvcc transaction and releases the component lock.
   // Only one of Commit() or Abort() should be called.
@@ -168,9 +169,6 @@ class WriteTransactionState : public TransactionState {
 
   void UpdateMetricsForOp(const RowOp& op);
 
-  // Releases all the row locks acquired by this transaction.
-  void release_row_locks();
-
   // Resets this TransactionState, releasing all locks, destroying all prepared
   // writes, clearing the transaction result _and_ committing the current Mvcc
   // transaction.
@@ -179,6 +177,9 @@ class WriteTransactionState : public TransactionState {
   virtual std::string ToString() const OVERRIDE;
 
  private:
+  // Releases all the row locks acquired by this transaction.
+  void ReleaseRowLocks();
+
   // Reset the RPC request, response, and row_ops_ (which refers to data
   // from the request).
   void ResetRpcFields();
@@ -252,9 +253,6 @@ class WriteTransaction : public Transaction {
   // algorithm.
   virtual Status Apply(gscoped_ptr<consensus::CommitMsg>* commit_msg) OVERRIDE;
 
-  // Releases the row locks (Early Lock Release).
-  virtual void PreCommit() OVERRIDE;
-
   // If result == COMMITTED, commits the mvcc transaction and updates
   // the metrics, if result == ABORTED aborts the mvcc transaction.
   virtual void Finish(TransactionResult result) OVERRIDE;