You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by aw...@apache.org on 2020/01/08 04:36:07 UTC

[kudu] branch master updated (4619473 -> c033425)

This is an automated email from the ASF dual-hosted git repository.

awong pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git.


    from 4619473  KUDU-3011 p3: mechanism to quiesce scans
     new da55638  consensus: throttle warnings when quiescing server
     new c033425  KUDU-3011 p4: follow-up to 4619473

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/kudu/consensus/raft_consensus.cc                       | 12 ++++++------
 .../integration-tests/tablet_server_quiescing-itest.cc     | 14 +++++++++++++-
 src/kudu/integration-tests/test_workload.h                 |  2 +-
 src/kudu/tserver/tablet_service.cc                         |  8 ++++----
 src/kudu/util/logging.h                                    |  7 +++++++
 5 files changed, 31 insertions(+), 12 deletions(-)


[kudu] 01/02: consensus: throttle warnings when quiescing server

Posted by aw...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit da55638394d78830b3440b2272f8161afae08ca0
Author: Andrew Wong <aw...@apache.org>
AuthorDate: Tue Jan 7 18:07:26 2020 -0800

    consensus: throttle warnings when quiescing server
    
    When an election fails to start because the server is quiescing, it will
    log a warning. When there are many replicas on a server, this logging
    can become very verbose.
    
    This throttles such logs to happen at most once per ten seconds. Here's
    a snippet of the logs from TestQuiescingServerDoesntTriggerElections:
    
    Before:
    W0107 17:55:46.943423 55670 raft_consensus.cc:655] T d1bc343b18e348ada0a4fadb835d8cc1 P f0fd8e85d7a843e6862dfd846ca47351: failed to trigger leader election: Illegal state: leader elections are disabled
    W0107 17:55:46.947376 55670 raft_consensus.cc:655] T 4fb4aec4dbb94dc9b2af086b761d2632 P f0fd8e85d7a843e6862dfd846ca47351: failed to trigger leader election: Illegal state: leader elections are disabled
    W0107 17:55:46.949579 55670 raft_consensus.cc:655] T 49bf5837ae0c4d95bf1a176a856c9f37 P f0fd8e85d7a843e6862dfd846ca47351: failed to trigger leader election: Illegal state: leader elections are disabled
    W0107 17:55:46.992861 55670 raft_consensus.cc:655] T 842d307a87f846efa00bbf0a537fe812 P f0fd8e85d7a843e6862dfd846ca47351: failed to trigger leader election: Illegal state: leader elections are disabled
    
    After:
    W0107 19:16:47.518998 87519 raft_consensus.cc:655] T 1e0aed8879c643d6a8684f6da729a1a3 P 14907cae807b4560b4b8417ba5f24393: failed to trigger leader election: Illegal state: leader elections are disabled
    
    (in After, the test didn't last 10 seconds)
    
    While it would be nice to retain logs for every replica for grep-ability, I
    think the improved readability of the logs justifies the loss.
    
    Change-Id: I01e3c88c2bd4c53d095b9b5a061439231dcfc9e9
    Reviewed-on: http://gerrit.cloudera.org:8080/14989
    Reviewed-by: Alexey Serbin <as...@cloudera.com>
    Tested-by: Andrew Wong <aw...@cloudera.com>
---
 src/kudu/consensus/raft_consensus.cc | 12 ++++++------
 src/kudu/util/logging.h              |  7 +++++++
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/kudu/consensus/raft_consensus.cc b/src/kudu/consensus/raft_consensus.cc
index 204c79f..8f83ed8 100644
--- a/src/kudu/consensus/raft_consensus.cc
+++ b/src/kudu/consensus/raft_consensus.cc
@@ -379,8 +379,8 @@ Status RaftConsensus::Start(const ConsensusBootstrapInfo& info,
 
   if (IsSingleVoterConfig() && FLAGS_enable_leader_failure_detection) {
     LOG_WITH_PREFIX(INFO) << "Only one voter in the Raft config. Triggering election immediately";
-    WARN_NOT_OK(StartElection(NORMAL_ELECTION, INITIAL_SINGLE_NODE_ELECTION),
-                "Couldn't start leader election");
+    WARN_NOT_OK_EVERY_N_SECS(StartElection(NORMAL_ELECTION, INITIAL_SINGLE_NODE_ELECTION),
+                             "Couldn't start leader election", 10);
   }
 
   // Report become visible to the Master.
@@ -650,9 +650,9 @@ void RaftConsensus::ReportFailureDetectedTask() {
   std::unique_lock<simple_spinlock> try_lock(failure_detector_election_lock_,
                                              std::try_to_lock);
   if (try_lock.owns_lock()) {
-    WARN_NOT_OK(StartElection(FLAGS_raft_enable_pre_election ?
+    WARN_NOT_OK_EVERY_N_SECS(StartElection(FLAGS_raft_enable_pre_election ?
         PRE_ELECTION : NORMAL_ELECTION, ELECTION_TIMEOUT_EXPIRED),
-                LogPrefixThreadSafe() + "failed to trigger leader election");
+                             LogPrefixThreadSafe() + "failed to trigger leader election", 10);
   }
 }
 
@@ -2698,8 +2698,8 @@ void RaftConsensus::DoElectionCallback(ElectionReason reason, const ElectionResu
   if (was_pre_election) {
     // We just won the pre-election. So, we need to call a real election.
     lock.unlock();
-    WARN_NOT_OK(StartElection(NORMAL_ELECTION, reason),
-                "Couldn't start leader election after successful pre-election");
+    WARN_NOT_OK_EVERY_N_SECS(StartElection(NORMAL_ELECTION, reason),
+                             "Couldn't start leader election after successful pre-election", 10);
   } else {
     // We won a real election. Convert role to LEADER.
     SetLeaderUuidUnlocked(peer_uuid());
diff --git a/src/kudu/util/logging.h b/src/kudu/util/logging.h
index 87012b1..0d3492a 100644
--- a/src/kudu/util/logging.h
+++ b/src/kudu/util/logging.h
@@ -164,6 +164,13 @@ class ScopedDisableRedaction {
   static logging::LogThrottler LOG_THROTTLER;  \
   KLOG_EVERY_N_SECS_THROTTLER(severity, n_secs, LOG_THROTTLER, "no-tag")
 
+#define WARN_NOT_OK_EVERY_N_SECS(to_call, warning_prefix, n_secs) do {                 \
+    const ::kudu::Status& _s = (to_call);                                              \
+    if (PREDICT_FALSE(!_s.ok())) {                                                     \
+      KLOG_EVERY_N_SECS(WARNING, n_secs) << (warning_prefix) << ": " << _s.ToString()  \
+                                         << THROTTLE_MSG;                              \
+    }                                                                                  \
+  } while (0)
 
 namespace kudu {
 enum PRIVATE_ThrottleMsg {THROTTLE_MSG};


[kudu] 02/02: KUDU-3011 p4: follow-up to 4619473

Posted by aw...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit c033425b3d579591ebb15c75de1f193604b8e95b
Author: Andrew Wong <aw...@apache.org>
AuthorDate: Tue Jan 7 19:09:51 2020 -0800

    KUDU-3011 p4: follow-up to 4619473
    
    This addresses feedback on commit 4619473 left after merging.
    
    Change-Id: I6d30b28fec9e3c34060f3ad45f6e34a33d4e9a23
    Reviewed-on: http://gerrit.cloudera.org:8080/14990
    Reviewed-by: Alexey Serbin <as...@cloudera.com>
    Tested-by: Kudu Jenkins
---
 .../integration-tests/tablet_server_quiescing-itest.cc     | 14 +++++++++++++-
 src/kudu/integration-tests/test_workload.h                 |  2 +-
 src/kudu/tserver/tablet_service.cc                         |  8 ++++----
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/kudu/integration-tests/tablet_server_quiescing-itest.cc b/src/kudu/integration-tests/tablet_server_quiescing-itest.cc
index 5c18a71..bf4fcec 100644
--- a/src/kudu/integration-tests/tablet_server_quiescing-itest.cc
+++ b/src/kudu/integration-tests/tablet_server_quiescing-itest.cc
@@ -51,6 +51,7 @@ DECLARE_bool(catalog_manager_wait_for_new_tablets_to_elect_leader);
 DECLARE_double(leader_failure_max_missed_heartbeat_periods);
 DECLARE_int32(consensus_inject_latency_ms_in_notifications);
 DECLARE_int32(scanner_default_batch_size_bytes);
+DECLARE_int32(scanner_ttl_ms);
 DECLARE_int32(raft_heartbeat_interval_ms);
 
 using kudu::client::KuduClient;
@@ -374,12 +375,23 @@ TEST_P(TServerQuiescingParamITest, TestScansRetry) {
   // Now stop quiescing one of the servers. Our scans should succeed. Set a
   // small batch size so our scanner remains active.
   FLAGS_scanner_default_batch_size_bytes = 1;
+  // Make our scanner expire really quickly so we can test that we can keep the
+  // scanner alive even while the tserver is quiescing.
+  FLAGS_scanner_ttl_ms = 1000;
   auto* ts = cluster_->mini_tablet_server(0)->server();
   *ts->mutable_quiescing() = false;
   KuduScanBatch batch;
   ASSERT_OK(scanner.Open());
   ASSERT_OK(scanner.NextBatch(&batch));
-  ASSERT_EQ(1, ts->scanner_manager()->CountActiveScanners());
+
+  // Keep the scanner alive, even as we're quiescing.
+  const auto past_original_expiration =
+      MonoTime::Now() + MonoDelta::FromMilliseconds(2 * FLAGS_scanner_ttl_ms);
+  while (MonoTime::Now() < past_original_expiration) {
+    ASSERT_EQ(1, ts->scanner_manager()->CountActiveScanners());
+    ASSERT_OK(scanner.KeepAlive());
+    SleepFor(MonoDelta::FromMilliseconds(10));
+  }
 }
 
 INSTANTIATE_TEST_CASE_P(NumReplicas, TServerQuiescingParamITest, ::testing::Values(1, 3));
diff --git a/src/kudu/integration-tests/test_workload.h b/src/kudu/integration-tests/test_workload.h
index 1f3eaa9..5afa625 100644
--- a/src/kudu/integration-tests/test_workload.h
+++ b/src/kudu/integration-tests/test_workload.h
@@ -128,7 +128,7 @@ class TestWorkload {
   }
 
   // Set whether we should attempt to verify the number of rows when scanning.
-  // This sort of error may be indicative of a stale read.
+  // An incorrect number of rows may be indicative of a stale read.
   void set_verify_num_rows(bool should_verify) {
     verify_num_rows_ = should_verify;
   }
diff --git a/src/kudu/tserver/tablet_service.cc b/src/kudu/tserver/tablet_service.cc
index 6b49af3..93c6d9c 100644
--- a/src/kudu/tserver/tablet_service.cc
+++ b/src/kudu/tserver/tablet_service.cc
@@ -359,8 +359,8 @@ bool GetConsensusOrRespond(const scoped_refptr<TabletReplica>& replica,
 }
 
 template<class RespClass>
-bool CheckTabletServerQuiescingOrRespond(const TabletServer* server, RespClass* resp,
-                                         rpc::RpcContext* context) {
+bool CheckTabletServerNotQuiescingOrRespond(const TabletServer* server, RespClass* resp,
+                                            rpc::RpcContext* context) {
   if (PREDICT_FALSE(server->quiescing())) {
     Status s = Status::ServiceUnavailable("Tablet server is quiescing");
     SetupErrorAndRespond(resp->mutable_error(), s,
@@ -1708,7 +1708,7 @@ void TabletServiceImpl::Scan(const ScanRequestPB* req,
   bool has_more_results = false;
   TabletServerErrorPB::Code error_code = TabletServerErrorPB::UNKNOWN_ERROR;
   if (req->has_new_scan_request()) {
-    if (!CheckTabletServerQuiescingOrRespond(server_, resp, context)) {
+    if (!CheckTabletServerNotQuiescingOrRespond(server_, resp, context)) {
       return;
     }
     const NewScanRequestPB& scan_pb = req->new_scan_request();
@@ -2022,7 +2022,7 @@ void TabletServiceImpl::Checksum(const ChecksumRequestPB* req,
     }
   }
   if (req->has_new_request()) {
-    if (!CheckTabletServerQuiescingOrRespond(server_, resp, context)) {
+    if (!CheckTabletServerNotQuiescingOrRespond(server_, resp, context)) {
       return;
     }
     const NewScanRequestPB& new_req = req->new_request();