You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by to...@apache.org on 2016/04/02 03:27:26 UTC
incubator-kudu git commit: KUDU-1387. Fix a case where the scanner
tight-loops and then sleeps too long
Repository: incubator-kudu
Updated Branches:
refs/heads/master 1f014124d -> 563313d15
KUDU-1387. Fix a case where the scanner tight-loops and then sleeps too long
This prevents the following issue:
- the leader is down and election has not yet been triggered
- the scanner tries to hit the leader, and gets 'connection refused', and thus
marks it as down, then goes back to the scanner retry loop
- in the tablet lookup path, RemoteTablet::HasLeader() returns false because
the leader is known to be down. This causes the client to fetch new locations.
- fetching new locations marks the server as up again. This logic is dubious,
but will be more complicated to address.
- because the server is now seen as "up" again, we just retry on the same server.
The patch fixes the scanner code so that, when a tablet server is down, it is added
to the scan's blacklist in addition to marking the server as down client-wide.
This makes the scanner code realize that all eligible servers are blacklisted and
trigger a sleep and backoff before retrying.
Without this patch, linked_list-test timed out a few percent of the time
in RELEASE builds. With the patch, it passed 200/200 times. I also noticed
that an existing test in client-test was triggering the tight retries, but
didn't have any assertion to detect the problematic number of RPCs.
Change-Id: I3cb3afa81cd6f75756c328b6ffe23a385f4b172d
Reviewed-on: http://gerrit.cloudera.org:8080/2699
Reviewed-by: Adar Dembo <ad...@cloudera.com>
Tested-by: Kudu Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kudu/commit/563313d1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kudu/tree/563313d1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kudu/diff/563313d1
Branch: refs/heads/master
Commit: 563313d15e922db4255736ed1423bb418bbcd6fd
Parents: 1f01412
Author: Todd Lipcon <to...@apache.org>
Authored: Fri Apr 1 01:10:14 2016 -0700
Committer: Todd Lipcon <to...@apache.org>
Committed: Sat Apr 2 01:04:59 2016 +0000
----------------------------------------------------------------------
src/kudu/client/client-test.cc | 27 +++++++++++++++++++++++++++
src/kudu/client/scanner-internal.cc | 16 +++++++---------
2 files changed, 34 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/563313d1/src/kudu/client/client-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/client-test.cc b/src/kudu/client/client-test.cc
index a49aaf8..f5ae23e 100644
--- a/src/kudu/client/client-test.cc
+++ b/src/kudu/client/client-test.cc
@@ -72,6 +72,9 @@ DECLARE_int32(scanner_ttl_ms);
DEFINE_int32(test_scan_num_rows, 1000, "Number of rows to insert and scan");
METRIC_DECLARE_counter(rpcs_queue_overflow);
+METRIC_DECLARE_histogram(handler_latency_kudu_master_MasterService_GetMasterRegistration);
+METRIC_DECLARE_histogram(handler_latency_kudu_master_MasterService_GetTableLocations);
+METRIC_DECLARE_histogram(handler_latency_kudu_master_MasterService_GetTabletLocations);
using std::string;
using std::set;
@@ -178,6 +181,19 @@ class ClientTest : public KuduTest {
}
}
+ // Return the number of lookup-related RPCs which have been serviced by the master.
+ int CountMasterLookupRPCs() const {
+ auto ent = cluster_->mini_master()->master()->metric_entity();
+ int ret = 0;
+ ret += METRIC_handler_latency_kudu_master_MasterService_GetMasterRegistration
+ .Instantiate(ent)->TotalCount();
+ ret += METRIC_handler_latency_kudu_master_MasterService_GetTableLocations
+ .Instantiate(ent)->TotalCount();
+ ret += METRIC_handler_latency_kudu_master_MasterService_GetTabletLocations
+ .Instantiate(ent)->TotalCount();
+ return ret;
+ }
+
// Inserts 'num_rows' test rows using 'client'
void InsertTestRows(KuduClient* client, KuduTable* table, int num_rows, int first_row = 0) {
shared_ptr<KuduSession> session = client->NewSession();
@@ -2130,10 +2146,21 @@ TEST_F(ClientTest, TestReplicatedMultiTabletTableFailover) {
// We wait until we fail over to the new leader(s).
int tries = 0;
for (;;) {
+ int master_rpcs_before = CountMasterLookupRPCs();
tries++;
int num_rows = CountRowsFromClient(table.get(),
KuduClient::LEADER_ONLY,
kNoBound, kNoBound);
+ int master_rpcs = CountMasterLookupRPCs() - master_rpcs_before;
+
+ // Regression test for KUDU-1387: we should not have any tight loops
+ // hitting the master during the time when a client is awaiting leader
+ // election. However, we do expect a reasonable number of lookup retries
+ // as the client will retry rapidly at first and then back off. 20 is
+ // enough to avoid test flakiness. Before fixing the above-mentioned bug,
+ // we would see several hundred lookups here.
+ ASSERT_LT(master_rpcs, 20);
+
if (num_rows == kNumRowsToWrite) {
LOG(INFO) << "Found expected number of rows: " << num_rows;
break;
http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/563313d1/src/kudu/client/scanner-internal.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/scanner-internal.cc b/src/kudu/client/scanner-internal.cc
index 3a3fc11..0863e47 100644
--- a/src/kudu/client/scanner-internal.cc
+++ b/src/kudu/client/scanner-internal.cc
@@ -185,6 +185,7 @@ Status KuduScanner::Data::CanBeRetried(const bool isNewScan,
// All other types of network errors are retriable, and also indicate the tserver is failed.
UpdateLastError(rpc_status);
table_->client()->data_->meta_cache_->MarkTSFailed(ts_, rpc_status);
+ blacklist->insert(ts_->permanent_uuid());
}
}
@@ -224,15 +225,6 @@ Status KuduScanner::Data::CanBeRetried(const bool isNewScan,
VLOG(1) << "Got error code " << tserver::TabletServerErrorPB::Code_Name(error.code())
<< ": temporarily blacklisting node " << ts_->permanent_uuid();
blacklist->insert(ts_->permanent_uuid());
- // We've blacklisted all the live candidate tservers.
- // Do a short random sleep, clear the temp blacklist, then do another round of retries.
- if (!candidates.empty() && candidates.size() == blacklist->size()) {
- MonoDelta sleep_delta = MonoDelta::FromMilliseconds((random() % 5000) + 1000);
- LOG(INFO) << "All live candidate nodes are unavailable because of transient errors."
- << " Sleeping for " << sleep_delta.ToMilliseconds() << " ms before trying again.";
- SleepFor(sleep_delta);
- blacklist->clear();
- }
break;
case tserver::TabletServerErrorPB::TABLET_NOT_FOUND: {
// There was either a tablet configuration change or the table was
@@ -354,7 +346,13 @@ Status KuduScanner::Data::OpenTablet(const string& partition_key,
// soon have one.
if (lookup_status.IsServiceUnavailable() &&
MonoTime::Now(MonoTime::FINE).ComesBefore(deadline)) {
+
+ // ServiceUnavailable means that we have already blacklisted all of the candidate
+ // tablet servers. So, we clear the list so that we will cycle through them all
+ // another time.
+ blacklist->clear();
int sleep_ms = attempt * 100;
+ // TODO: should ensure that sleep_ms does not pass the provided deadline.
VLOG(1) << "Tablet " << remote_->tablet_id() << " current unavailable: "
<< lookup_status.ToString() << ". Sleeping for " << sleep_ms << "ms "
<< "and retrying...";