You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by mp...@apache.org on 2018/10/18 02:09:13 UTC

[3/3] kudu git commit: client: add timeout duration and scan attempts to scanner errors

client: add timeout duration and scan attempts to scanner errors

Previously, scanner errors could be confusing because certain types of
errors, like ServiceUnavailable, are retriable, and the RPC timeouts
mentioned in some error messages tend to be confusingly short because
after several retries the client-side deadline is looming. This patch
adds additional details to scanner timeout error messages, including
information about number of retries and the client-set timeout value.

An example of a new "enhanced" error message looks something like this
(from the new test that injects an error on scan):

Timed out: exceeded configured scan timeout of 1.000s: after 7 scan attempts: unable to retry before timeout: Remote error: Service unavailable: Injecting service unavailable status on Scan due to --scanner_inject_service_unavailable_on_continue_scan

Change-Id: I3a8f731f029132d0894355098d5804840f09e7c2
Reviewed-on: http://gerrit.cloudera.org:8080/11646
Reviewed-by: Adar Dembo <ad...@cloudera.com>
Tested-by: Mike Percy <mp...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/8386ef9e
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/8386ef9e
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/8386ef9e

Branch: refs/heads/master
Commit: 8386ef9e2962ee5692c0e646fdd63f9826b3b47c
Parents: e61de49
Author: Mike Percy <mp...@apache.org>
Authored: Tue Oct 9 17:15:46 2018 -0700
Committer: Mike Percy <mp...@apache.org>
Committed: Thu Oct 18 02:08:50 2018 +0000

----------------------------------------------------------------------
 src/kudu/client/client-test.cc      | 20 ++++++++++++++++++++
 src/kudu/client/scanner-internal.cc | 27 +++++++++++++++++++--------
 src/kudu/client/scanner-internal.h  |  4 ++++
 src/kudu/tserver/tablet_service.cc  | 10 ++++++++++
 4 files changed, 53 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/8386ef9e/src/kudu/client/client-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/client-test.cc b/src/kudu/client/client-test.cc
index dc7e7b8..cc4c9ab 100644
--- a/src/kudu/client/client-test.cc
+++ b/src/kudu/client/client-test.cc
@@ -114,6 +114,7 @@ DECLARE_bool(fail_dns_resolution);
 DECLARE_bool(log_inject_latency);
 DECLARE_bool(master_support_connect_to_master_rpc);
 DECLARE_bool(rpc_trace_negotiation);
+DECLARE_bool(scanner_inject_service_unavailable_on_continue_scan);
 DECLARE_int32(flush_threshold_mb);
 DECLARE_int32(flush_threshold_secs);
 DECLARE_int32(heartbeat_interval_ms);
@@ -5161,6 +5162,25 @@ TEST_F(ClientTest, TestLastErrorEmbeddedInScanTimeoutStatus) {
   ASSERT_STR_CONTAINS(s.ToString(), "Illegal state: Tablet not RUNNING");
 }
 
+TEST_F(ClientTest, TestRetriesEmbeddedInScanTimeoutStatus) {
+  // For the random() calls that take place during scan retries.
+  SeedRandom();
+
+  NO_FATALS(InsertTestRows(client_table_.get(), FLAGS_test_scan_num_rows));
+
+  // Allow creating a scanner but fail all of the read calls.
+  FLAGS_scanner_inject_service_unavailable_on_continue_scan = true;
+
+  // The scanner will return a retriable error on read, and we will eventually
+  // observe a timeout.
+  KuduScanner scanner(client_table_.get());
+  ASSERT_OK(scanner.SetTimeoutMillis(1000));
+  Status s = scanner.Open();
+  ASSERT_TRUE(s.IsTimedOut()) << s.ToString();
+  ASSERT_STR_CONTAINS(s.ToString(), "exceeded configured scan timeout of");
+  ASSERT_STR_CONTAINS(s.ToString(), "scan attempts");
+}
+
 TEST_F(ClientTest, TestNoDefaultPartitioning) {
     unique_ptr<KuduTableCreator> table_creator(client_->NewTableCreator());
     Status s = table_creator->table_name("TestNoDefaultPartitioning").schema(&schema_).Create();

http://git-wip-us.apache.org/repos/asf/kudu/blob/8386ef9e/src/kudu/client/scanner-internal.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/scanner-internal.cc b/src/kudu/client/scanner-internal.cc
index 51cdca6..9ac1a86 100644
--- a/src/kudu/client/scanner-internal.cc
+++ b/src/kudu/client/scanner-internal.cc
@@ -81,6 +81,20 @@ KuduScanner::Data::Data(KuduTable* table)
 KuduScanner::Data::~Data() {
 }
 
+Status KuduScanner::Data::EnrichStatusMessage(Status s) const {
+  if (scan_attempts_ > 1) {
+    s = s.CloneAndPrepend(Substitute("after $0 scan attempts", scan_attempts_));
+  }
+  if (s.IsTimedOut()) {
+    s = s.CloneAndPrepend(Substitute("exceeded configured scan timeout of $0",
+                                     configuration_.timeout().ToString()));
+  }
+  if (!last_error_.ok()) {
+    s = s.CloneAndAppend(last_error_.ToString());
+  }
+  return s;
+}
+
 Status KuduScanner::Data::HandleError(const ScanRpcStatus& err,
                                       const MonoTime& deadline,
                                       set<string>* blacklist,
@@ -92,10 +106,10 @@ Status KuduScanner::Data::HandleError(const ScanRpcStatus& err,
   // If we timed out because of the overall deadline, we're done.
   // We didn't wait a full RPC timeout, though, so don't mark the tserver as failed.
   if (err.result == ScanRpcStatus::OVERALL_DEADLINE_EXCEEDED) {
-      LOG(INFO) << "Scan of tablet " << remote_->tablet_id() << " at "
-          << ts_->ToString() << " deadline expired.";
-      return last_error_.ok()
-          ? err.status : err.status.CloneAndAppend(last_error_.ToString());
+    LOG(INFO) << "Scan of tablet " << remote_->tablet_id() << " at "
+              << ts_->ToString() << " deadline exceeded after "
+              << configuration_.timeout().ToString();
+    return EnrichStatusMessage(err.status);
   }
 
   UpdateLastError(err.status);
@@ -172,10 +186,7 @@ Status KuduScanner::Data::HandleError(const ScanRpcStatus& err,
         KuduClient::Data::ComputeExponentialBackoff(scan_attempts_);
     MonoTime now = MonoTime::Now() + sleep;
     if (deadline < now) {
-      Status ret = Status::TimedOut("unable to retry before timeout",
-                                    err.status.ToString());
-      return last_error_.ok() ?
-          ret : ret.CloneAndAppend(last_error_.ToString());
+      return EnrichStatusMessage(Status::TimedOut("unable to retry before timeout"));
     }
     VLOG(1) << "Error scanning on server " << ts_->ToString() << ": "
             << err.status.ToString() << ". Will retry after "

http://git-wip-us.apache.org/repos/asf/kudu/blob/8386ef9e/src/kudu/client/scanner-internal.h
----------------------------------------------------------------------
diff --git a/src/kudu/client/scanner-internal.h b/src/kudu/client/scanner-internal.h
index 5db5bd9..0dddb40 100644
--- a/src/kudu/client/scanner-internal.h
+++ b/src/kudu/client/scanner-internal.h
@@ -278,6 +278,10 @@ class KuduScanner::Data {
                                 const MonoTime& overall_deadline,
                                 const MonoTime& rpc_deadline);
 
+  // Add additional details to the status message, such as number of retries,
+  // original cause of the error, etc. Returns a cloned object.
+  Status EnrichStatusMessage(Status s) const;
+
   void UpdateResourceMetrics();
 
   DISALLOW_COPY_AND_ASSIGN(Data);

http://git-wip-us.apache.org/repos/asf/kudu/blob/8386ef9e/src/kudu/tserver/tablet_service.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_service.cc b/src/kudu/tserver/tablet_service.cc
index f8b309e..da4002c 100644
--- a/src/kudu/tserver/tablet_service.cc
+++ b/src/kudu/tserver/tablet_service.cc
@@ -132,6 +132,11 @@ DEFINE_int32(scanner_inject_latency_on_each_batch_ms, 0,
              "Used for tests.");
 TAG_FLAG(scanner_inject_latency_on_each_batch_ms, unsafe);
 
+DEFINE_bool(scanner_inject_service_unavailable_on_continue_scan, false,
+           "If set, the scanner will return a ServiceUnavailable Status on "
+           "any Scan continuation RPC call. Used for tests.");
+TAG_FLAG(scanner_inject_service_unavailable_on_continue_scan, unsafe);
+
 DECLARE_bool(raft_prepare_replacement_before_eviction);
 DECLARE_int32(memory_limit_warn_threshold_percentage);
 DECLARE_int32(tablet_history_max_age_sec);
@@ -2052,6 +2057,11 @@ Status TabletServiceImpl::HandleContinueScanRequest(const ScanRequestPB* req,
     return s;
   }
 
+  if (PREDICT_FALSE(FLAGS_scanner_inject_service_unavailable_on_continue_scan)) {
+    return Status::ServiceUnavailable("Injecting service unavailable status on Scan due to "
+                                      "--scanner_inject_service_unavailable_on_continue_scan");
+  }
+
   // Set the row format flags on the ScanResultCollector.
   result_collector->set_row_format_flags(scanner->row_format_flags());