You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2020/12/18 06:52:33 UTC

[kudu] branch master updated: KUDU-2612: two more scenarios for txn keepalive in client

This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new ac75ea8  KUDU-2612: two more scenarios for txn keepalive in client
ac75ea8 is described below

commit ac75ea826f3a6f04bc31d0d487ae924917e64f8c
Author: Alexey Serbin <al...@apache.org>
AuthorDate: Wed Dec 16 22:29:15 2020 -0800

    KUDU-2612: two more scenarios for txn keepalive in client
    
    This patch adds two more test scenarios related to txn keepalive
    messages automatically sent by Kudu C++ client.  These are to verify
    the corresponding functionality of the client when TxnManager instances
    are not available for short and long time intervals, where 'short' and
    'long' are relative to the txn keepalive heartbeat timeout interval.
    
    In addition, I sneaked in one small fix to avoid TSAN warnings
    when running the newly introduced tests: the removed CHECK() in
    Master::WaitForTxnManagerInit() isn't crucial.
    
    Change-Id: I42988996f1ddb1cd456d289ea7e15586bd7e3dc7
    Reviewed-on: http://gerrit.cloudera.org:8080/16886
    Tested-by: Kudu Jenkins
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
---
 src/kudu/client/client-test.cc | 95 ++++++++++++++++++++++++++++++++++++++++++
 src/kudu/master/master.cc      |  1 -
 2 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/src/kudu/client/client-test.cc b/src/kudu/client/client-test.cc
index d1dca6d..60d9f1e 100644
--- a/src/kudu/client/client-test.cc
+++ b/src/kudu/client/client-test.cc
@@ -7598,6 +7598,101 @@ TEST_F(ClientTest, TxnKeepAlive) {
   }
 }
 
+// A scenario to explicitly show that long-running transactions are not aborted
+// if Kudu masters are not available for periods of time shorter than the txn
+// keepalive interval.
+TEST_F(ClientTest, TxnKeepAliveAndUnavailableTxnManagerShortTime) {
+  SKIP_IF_SLOW_NOT_ALLOWED();
+
+#if defined(THREAD_SANITIZER) || defined(ADDRESS_SANITIZER)
+  // ASAN and TSAN use longer intervals to avoid flakiness: it takes some
+  // time to restart masters, and that adds up to the unavailability interval.
+  constexpr const auto kUnavailabilityIntervalMs = 5000;
+#else
+  constexpr const auto kUnavailabilityIntervalMs = 1000;
+#endif
+
+  // To avoid flakiness, set the txn keepalive interval longer than it is in
+  // other scenarios (NOTE: it's still much shorter than it's default value
+  // to be used in real life).
+  //
+  // The cluster is restarted to avoid TSAN warnings on the access to the
+  // flag values by the stale txn monitoring thread and the assignments below.
+  cluster_->Shutdown();
+  FLAGS_txn_keepalive_interval_ms = 3 * kUnavailabilityIntervalMs;
+  FLAGS_txn_staleness_tracker_interval_ms = kUnavailabilityIntervalMs / 8;
+  ASSERT_OK(cluster_->Start());
+
+  shared_ptr<KuduTransaction> txn;
+  ASSERT_OK(client_->NewTransaction(&txn));
+
+  // Shutdown masters -- they host TxnManager instances which proxy txn-related
+  // RPC calls from clients to corresponding TxnStatusManager.
+  cluster_->ShutdownNodes(cluster::ClusterNodes::MASTERS_ONLY);
+
+  // An attempt to commit a transaction should fail right away (i.e. no retries)
+  // due to unreachable masters.
+  {
+    auto s = txn->Commit(false /* wait */);
+    ASSERT_TRUE(s.IsNetworkError()) << s.ToString();
+    ASSERT_STR_CONTAINS(s.ToString(), "connection negotiation failed");
+    ASSERT_STR_CONTAINS(s.ToString(), "Connection refused");
+  }
+
+  // Wait for some time, but not too long to allow the system to get txn
+  // keepalive heartbeat before the timeout.
+  SleepFor(MonoDelta::FromMilliseconds(kUnavailabilityIntervalMs));
+
+  // Start masters back.
+  for (auto idx = 0; idx < cluster_->num_masters(); ++idx) {
+    ASSERT_OK(cluster_->mini_master(idx)->Restart());
+  }
+
+  // Now, when masters are back and running, the client should be able
+  // to commit the transaction. It should not be automatically aborted.
+  ASSERT_OK(txn->Commit(false /* wait */));
+  ASSERT_OK(FinalizeCommitTransaction(txn));
+}
+
+// A scenario to explicitly show that long-running transactions are
+// aborted if Kudu masters are not available for time period longer than
+// the txn keepalive interval.
+TEST_F(ClientTest, TxnKeepAliveAndUnavailableTxnManagerLongTime) {
+  shared_ptr<KuduTransaction> txn;
+  ASSERT_OK(client_->NewTransaction(&txn));
+
+  // Shutdown masters -- they host TxnManager instances which proxy txn-related
+  // RPC calls from clients to corresponding TxnStatusManager.
+  cluster_->ShutdownNodes(cluster::ClusterNodes::MASTERS_ONLY);
+
+  // Wait for some time to allow the system to detect stale transactions.
+  SleepFor(MonoDelta::FromMilliseconds(2 * FLAGS_txn_keepalive_interval_ms));
+
+  // An attempt to commit a transaction should fail due to unreachable masters.
+  {
+    auto s = txn->Commit(false /* wait */);
+    ASSERT_TRUE(s.IsNetworkError()) << s.ToString();
+    ASSERT_STR_CONTAINS(s.ToString(), "connection negotiation failed");
+    ASSERT_STR_CONTAINS(s.ToString(), "Connection refused");
+  }
+
+  // Start masters back.
+  for (auto idx = 0; idx < cluster_->num_masters(); ++idx) {
+    ASSERT_OK(cluster_->mini_master(idx)->Restart());
+  }
+
+  // Now, when masters are back and running, the client should get the
+  // Status::IllegalState() status back when trying to commit the transaction
+  // which has been automatically aborted by the system due to not receiving
+  // any txn keepalive messages for longer than prescribed by the
+  // --txn_keepalive_interval_ms flag.
+  {
+    auto s = txn->Commit(false /* wait */);
+    ASSERT_TRUE(s.IsIllegalState()) << s.ToString();
+    ASSERT_STR_CONTAINS(s.ToString(), "not open: state: ABORTED");
+  }
+}
+
 // Client test that assigns locations to clients and tablet servers.
 // For now, assigns a uniform location to all clients and tablet servers.
 class ClientWithLocationTest : public ClientTest {
diff --git a/src/kudu/master/master.cc b/src/kudu/master/master.cc
index 7446b77..4f693d9 100644
--- a/src/kudu/master/master.cc
+++ b/src/kudu/master/master.cc
@@ -336,7 +336,6 @@ Status Master::InitTxnManager() {
 }
 
 Status Master::WaitForTxnManagerInit(const MonoDelta& timeout) const {
-  CHECK_EQ(state_, kRunning);
   if (timeout.Initialized()) {
     const Status* s = txn_manager_init_status_.WaitFor(timeout);
     if (!s) {