You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by to...@apache.org on 2016/12/05 01:58:24 UTC

[4/7] kudu git commit: raft_consensus-itest: add a test for a potential deadlock

raft_consensus-itest: add a test for a potential deadlock

This adds a test case where multiple pending operations have lock
dependencies on each other, and the replica needs to replace one that's
stuck waiting in the queue before it can make progress.

This catches a bug found in a revision of
https://gerrit.cloudera.org/#/c/5294/ which wasn't covered by existing
test cases.

Change-Id: Ie1143045780886958e45f667b1877b4e34e8b03e
Reviewed-on: http://gerrit.cloudera.org:8080/5341
Tested-by: Kudu Jenkins
Reviewed-by: David Ribeiro Alves <dr...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/44579480
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/44579480
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/44579480

Branch: refs/heads/master
Commit: 44579480490279ae0b7edbf6511f2b99b563e63b
Parents: 7e3071e
Author: Todd Lipcon <to...@apache.org>
Authored: Fri Dec 2 17:01:07 2016 -0800
Committer: David Ribeiro Alves <dr...@apache.org>
Committed: Sat Dec 3 01:57:39 2016 +0000

----------------------------------------------------------------------
 .../integration-tests/raft_consensus-itest.cc   | 74 +++++++++++++++++++-
 1 file changed, 72 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/44579480/src/kudu/integration-tests/raft_consensus-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus-itest.cc b/src/kudu/integration-tests/raft_consensus-itest.cc
index 1bd618d..8d9c8d3 100644
--- a/src/kudu/integration-tests/raft_consensus-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus-itest.cc
@@ -173,6 +173,11 @@ class RaftConsensusITest : public TabletServerIntegrationTestBase {
   // Add an Insert operation to the given consensus request.
   // The row to be inserted is generated based on the OpId.
   void AddOp(const OpId& id, ConsensusRequestPB* req);
+  void AddOpWithTypeAndKey(const OpId& id,
+                           RowOperationsPB::Type op_type,
+                           int32_t key,
+                           ConsensusRequestPB* req);
+
 
   string DumpToString(TServerDetails* leader,
                       const vector<string>& leader_results,
@@ -1174,6 +1179,14 @@ TEST_F(RaftConsensusITest, TestKUDU_597) {
 }
 
 void RaftConsensusITest::AddOp(const OpId& id, ConsensusRequestPB* req) {
+  AddOpWithTypeAndKey(id, RowOperationsPB::INSERT,
+                      id.index() * 10000 + id.term(), req);
+}
+
+void RaftConsensusITest::AddOpWithTypeAndKey(const OpId& id,
+                                             RowOperationsPB::Type op_type,
+                                             int32_t key,
+                                             ConsensusRequestPB* req) {
   ReplicateMsg* msg = req->add_ops();
   msg->mutable_id()->CopyFrom(id);
   msg->set_timestamp(id.index());
@@ -1181,8 +1194,7 @@ void RaftConsensusITest::AddOp(const OpId& id, ConsensusRequestPB* req) {
   WriteRequestPB* write_req = msg->mutable_write_request();
   CHECK_OK(SchemaToPB(schema_, write_req->mutable_schema()));
   write_req->set_tablet_id(tablet_id_);
-  int key = id.index() * 10000 + id.term();
-  AddTestRowToPB(RowOperationsPB::INSERT, schema_, key, id.term(),
+  AddTestRowToPB(op_type, schema_, key, id.term(),
                  id.ShortDebugString(), write_req->mutable_row_operations());
 }
 
@@ -1277,6 +1289,64 @@ TEST_F(RaftConsensusITest, TestLMPMismatchOnRestartedReplica) {
   EXPECT_EQ("2.2", OpIdToString(resp.status().last_received()));
 }
 
+// Test a scenario where a replica has pending operations with lock
+// dependencies on each other:
+//   2.2: UPSERT row 1
+//   2.3: UPSERT row 1
+//   2.4: UPSERT row 1
+// ...and a new leader tries to abort 2.4 in order to replace it with a new
+// operation. Because the operations have a lock dependency, operation 2.4
+// will be 'stuck' in the Prepare queue. This verifies that we can abort an
+// operation even if it's stuck in the queue.
+TEST_F(RaftConsensusITest, TestReplaceOperationStuckInPrepareQueue) {
+  TServerDetails* replica_ts;
+  NO_FATALS(SetupSingleReplicaTest(&replica_ts));
+
+  ConsensusServiceProxy* c_proxy = CHECK_NOTNULL(replica_ts->consensus_proxy.get());
+  ConsensusRequestPB req;
+  ConsensusResponsePB resp;
+  RpcController rpc;
+
+  req.set_tablet_id(tablet_id_);
+  req.set_dest_uuid(replica_ts->uuid());
+  req.set_caller_uuid("fake_caller");
+  req.set_caller_term(2);
+  req.set_all_replicated_index(0);
+  req.mutable_preceding_id()->CopyFrom(MakeOpId(1, 1));
+  AddOpWithTypeAndKey(MakeOpId(2, 2), RowOperationsPB::UPSERT, 1, &req);
+  AddOpWithTypeAndKey(MakeOpId(2, 3), RowOperationsPB::UPSERT, 1, &req);
+  AddOpWithTypeAndKey(MakeOpId(2, 4), RowOperationsPB::UPSERT, 1, &req);
+  req.set_committed_index(2);
+  rpc.Reset();
+  ASSERT_OK(c_proxy->UpdateConsensus(req, &resp, &rpc));
+  ASSERT_FALSE(resp.has_error()) << resp.DebugString();
+
+  // Replace operation 2.4 with 3.4, add 3.5 (upsert of a new key)
+  req.set_caller_term(3);
+  req.mutable_preceding_id()->CopyFrom(MakeOpId(2, 3));
+  req.clear_ops();
+  AddOpWithTypeAndKey(MakeOpId(3, 4), RowOperationsPB::UPSERT, 1, &req);
+  AddOpWithTypeAndKey(MakeOpId(3, 5), RowOperationsPB::UPSERT, 2, &req);
+  rpc.Reset();
+  rpc.set_timeout(MonoDelta::FromSeconds(5));
+  ASSERT_OK(c_proxy->UpdateConsensus(req, &resp, &rpc));
+  ASSERT_FALSE(resp.has_error()) << resp.DebugString();
+
+  // Commit all ops.
+  req.clear_ops();
+  req.set_committed_index(5);
+  req.mutable_preceding_id()->CopyFrom(MakeOpId(3, 5));
+  rpc.Reset();
+  ASSERT_OK(c_proxy->UpdateConsensus(req, &resp, &rpc));
+  ASSERT_FALSE(resp.has_error()) << resp.DebugString();
+
+  // Ensure we can read the data.
+  vector<string> results;
+  NO_FATALS(WaitForRowCount(replica_ts->tserver_proxy.get(), 2, &results));
+  ASSERT_EQ("(int32 key=1, int32 int_val=3, string string_val=\"term: 3 index: 4\")", results[0]);
+  ASSERT_EQ("(int32 key=2, int32 int_val=3, string string_val=\"term: 3 index: 5\")", results[1]);
+}
+
 // Regression test for KUDU-644:
 // Triggers some complicated scenarios on the replica involving aborting and
 // replacing transactions.