You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by mp...@apache.org on 2017/11/16 23:49:43 UTC

kudu git commit: Deflake tablet_replica-test anchor and transaction count assertions

Repository: kudu
Updated Branches:
  refs/heads/master 1e4db3148 -> 338cf61cb


Deflake tablet_replica-test anchor and transaction count assertions

Various assertions in this test failed to account for the fact that,
when a transaction finishes, its callback is triggered before it
unregisters itself from the transaction tracker. This means that, if we
wait for the callback and then immediately consult the transaction
tracker (or log anchor registry) we may still see the transaction,
causing the assertion to fail.

The fix here is to simply wrap such assertions with ASSERT_EVENTUALLY.

Prior to the test change, if I added a 10ms sleep in
TransactionDriver::Finalize() between callling the callback and
releasing the transaction from the tracker, I'd get consistent failures.
Now I can add such a sleep and the test still passes.

Change-Id: Icee241077558a16b8a5076ab0c059362e8e6f035
Reviewed-on: http://gerrit.cloudera.org:8080/8564
Reviewed-by: Alexey Serbin <as...@cloudera.com>
Reviewed-by: Mike Percy <mp...@apache.org>
Tested-by: Todd Lipcon <to...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/338cf61c
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/338cf61c
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/338cf61c

Branch: refs/heads/master
Commit: 338cf61cb7ec4357e91b9f55d9fe4ed9b3ee641c
Parents: 1e4db31
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Nov 15 20:42:39 2017 -0800
Committer: Todd Lipcon <to...@apache.org>
Committed: Thu Nov 16 22:44:22 2017 +0000

----------------------------------------------------------------------
 src/kudu/tablet/tablet_replica-test.cc | 31 +++++++++++++++++++----------
 1 file changed, 20 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/338cf61c/src/kudu/tablet/tablet_replica-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_replica-test.cc b/src/kudu/tablet/tablet_replica-test.cc
index f6b2143..bab0972 100644
--- a/src/kudu/tablet/tablet_replica-test.cc
+++ b/src/kudu/tablet/tablet_replica-test.cc
@@ -71,6 +71,7 @@
 #include "kudu/util/pb_util.h"
 #include "kudu/util/status.h"
 #include "kudu/util/test_macros.h"
+#include "kudu/util/test_util.h"
 #include "kudu/util/threadpool.h"
 
 METRIC_DECLARE_entity(tablet);
@@ -265,9 +266,14 @@ class TabletReplicaTest : public KuduTabletTest {
     return Status::OK();
   }
 
+  // Assert that there are no log anchors held on the tablet replica.
+  //
+  // NOTE: when a transaction finishes and notifies the completion callback, it still is
+  // registered with the transaction tracker for a very short time before being
+  // destructed. So, this should always be called with an ASSERT_EVENTUALLY wrapper.
   void AssertNoLogAnchors() {
     // Make sure that there are no registered anchors in the registry
-    CHECK_EQ(0, tablet_replica_->log_anchor_registry()->GetAnchorCountForTests());
+    ASSERT_EQ(0, tablet_replica_->log_anchor_registry()->GetAnchorCountForTests());
   }
 
   // Assert that the Log GC() anchor is earlier than the latest OpId in the Log.
@@ -330,7 +336,7 @@ TEST_F(TabletReplicaTest, TestMRSAnchorPreventsLogGC) {
   Log* log = tablet_replica_->log_.get();
   int32_t num_gced;
 
-  AssertNoLogAnchors();
+  ASSERT_EVENTUALLY([&]{ AssertNoLogAnchors(); });
 
   log::SegmentSequence segments;
   ASSERT_OK(log->reader()->GetSegmentsSnapshot(&segments));
@@ -350,7 +356,7 @@ TEST_F(TabletReplicaTest, TestMRSAnchorPreventsLogGC) {
 
   // Flush MRS as needed to ensure that we don't have OpId anchors in the MRS.
   tablet_replica_->tablet()->Flush();
-  AssertNoLogAnchors();
+  ASSERT_EVENTUALLY([&]{ AssertNoLogAnchors(); });
 
   // The first two segments should be deleted.
   // The last is anchored due to the commit in the last segment being the last
@@ -371,7 +377,7 @@ TEST_F(TabletReplicaTest, TestDMSAnchorPreventsLogGC) {
   shared_ptr<RaftConsensus> consensus = tablet_replica_->shared_consensus();
   int32_t num_gced;
 
-  AssertNoLogAnchors();
+  ASSERT_EVENTUALLY([&]{ AssertNoLogAnchors(); });
 
   log::SegmentSequence segments;
   ASSERT_OK(log->reader()->GetSegmentsSnapshot(&segments));
@@ -383,6 +389,7 @@ TEST_F(TabletReplicaTest, TestDMSAnchorPreventsLogGC) {
 
   // Flush MRS & GC log so the next mutation goes into a DMS.
   ASSERT_OK(tablet_replica_->tablet()->Flush());
+  ASSERT_EVENTUALLY([&]{ AssertNoLogAnchors(); });
   log::RetentionIndexes retention = tablet_replica_->GetRetentionIndexes();
   ASSERT_OK(log->GC(retention, &num_gced));
   // We will only GC 1, and have 1 left because the earliest needed OpId falls
@@ -390,7 +397,6 @@ TEST_F(TabletReplicaTest, TestDMSAnchorPreventsLogGC) {
   ASSERT_EQ(1, num_gced);
   ASSERT_OK(log->reader()->GetSegmentsSnapshot(&segments));
   ASSERT_EQ(2, segments.size());
-  AssertNoLogAnchors();
 
   boost::optional<OpId> id = consensus->GetLastOpId(consensus::RECEIVED_OPID);
   ASSERT_NE(boost::none, id);
@@ -429,7 +435,7 @@ TEST_F(TabletReplicaTest, TestDMSAnchorPreventsLogGC) {
   tablet_replica_->tablet()->FlushBiggestDMS();
 
   // Verify no anchors after Flush().
-  AssertNoLogAnchors();
+  ASSERT_EVENTUALLY([&]{ AssertNoLogAnchors(); });
 
   // We should only hang onto one segment due to no anchors.
   // The last log OpId is the commit in the last segment, so it only anchors
@@ -450,7 +456,7 @@ TEST_F(TabletReplicaTest, TestActiveTransactionPreventsLogGC) {
   Log* log = tablet_replica_->log_.get();
   int32_t num_gced;
 
-  AssertNoLogAnchors();
+  ASSERT_EVENTUALLY([&]{ AssertNoLogAnchors(); });
 
   log::SegmentSequence segments;
   ASSERT_OK(log->reader()->GetSegmentsSnapshot(&segments));
@@ -465,7 +471,7 @@ TEST_F(TabletReplicaTest, TestActiveTransactionPreventsLogGC) {
   tablet_replica_->tablet()->Flush();
 
   // Verify no anchors after Flush().
-  AssertNoLogAnchors();
+  ASSERT_EVENTUALLY([&]{ AssertNoLogAnchors(); });
 
   // Now create a long-lived Transaction that hangs during Apply().
   // Allow other transactions to go through. Logs should be populated, but the
@@ -523,8 +529,11 @@ TEST_F(TabletReplicaTest, TestActiveTransactionPreventsLogGC) {
   ASSERT_EQ(5, segments.size());
   ASSERT_EQ(1, tablet_replica_->log_anchor_registry()->GetAnchorCountForTests());
   tablet_replica_->tablet()->FlushBiggestDMS();
-  ASSERT_EQ(0, tablet_replica_->log_anchor_registry()->GetAnchorCountForTests());
-  ASSERT_EQ(1, tablet_replica_->txn_tracker_.GetNumPendingForTests());
+
+  ASSERT_EVENTUALLY([&]{
+      AssertNoLogAnchors();
+      ASSERT_EQ(1, tablet_replica_->txn_tracker_.GetNumPendingForTests());
+    });
 
   NO_FATALS(AssertLogAnchorEarlierThanLogLatest());
 
@@ -543,7 +552,7 @@ TEST_F(TabletReplicaTest, TestActiveTransactionPreventsLogGC) {
   tablet_replica_->txn_tracker_.WaitForAllToFinish();
   ASSERT_EQ(0, tablet_replica_->txn_tracker_.GetNumPendingForTests());
   tablet_replica_->tablet()->FlushBiggestDMS();
-  AssertNoLogAnchors();
+  ASSERT_EVENTUALLY([&]{ AssertNoLogAnchors(); });
 
   // All should be deleted except the two last segments.
   retention = tablet_replica_->GetRetentionIndexes();