You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by ad...@apache.org on 2016/02/02 22:11:22 UTC

incubator-kudu git commit: KUDU-1272: better error messages when client deadlines expire

Repository: incubator-kudu
Updated Branches:
  refs/heads/master 0f609bd39 -> ab2c673d9


KUDU-1272: better error messages when client deadlines expire

There are two problems here:
1. When an RpcController is configured with a deadline in the past, the
   subsequent RPC status message will include a negative timeout.
2. If a deadline expires during an RPC, we don't do a great job of inserting
   the operation name into the RPC status message.

To address #1, we need to stop using deadlines from the past as short
circuits to instantly time out operations. Barring the removal of the
deadline from RpcController's generated status message, we must explicitly
check for timeouts before sending RPCs.

For #2, I sprinkled some CloneAndAppend calls here and there to improve
status messages. It's pretty hard to reason about the scanner CanBeRetried()
method, so I left it alone. Spaghetti code abounds.

Change-Id: Iee294fd5d348fee37324d1bbc4119c20fc0299c2
Reviewed-on: http://gerrit.cloudera.org:8080/1986
Tested-by: Adar Dembo <ad...@cloudera.com>
Reviewed-by: Jean-Daniel Cryans


Project: http://git-wip-us.apache.org/repos/asf/incubator-kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kudu/commit/ab2c673d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kudu/tree/ab2c673d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kudu/diff/ab2c673d

Branch: refs/heads/master
Commit: ab2c673d9928a3e035ba6b15a9853587f56cc3af
Parents: 0f609bd
Author: Adar Dembo <ad...@cloudera.com>
Authored: Mon Feb 1 18:11:07 2016 -0800
Committer: Adar Dembo <ad...@cloudera.com>
Committed: Tue Feb 2 21:03:38 2016 +0000

----------------------------------------------------------------------
 src/kudu/client/client-internal.cc  | 66 ++++++++++++++++++++++----------
 src/kudu/client/client-internal.h   |  4 +-
 src/kudu/client/client-test.cc      |  2 +-
 src/kudu/client/client.cc           |  2 +
 src/kudu/client/meta_cache.cc       | 26 +++++++++----
 src/kudu/client/scanner-internal.cc |  9 ++++-
 src/kudu/client/table-internal.cc   |  5 ++-
 7 files changed, 80 insertions(+), 34 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/ab2c673d/src/kudu/client/client-internal.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/client-internal.cc b/src/kudu/client/client-internal.cc
index 7c03aaf..4ed4a6c 100644
--- a/src/kudu/client/client-internal.cc
+++ b/src/kudu/client/client-internal.cc
@@ -131,6 +131,7 @@ Status KuduClient::Data::SyncLeaderMasterRpc(
     const ReqClass& req,
     RespClass* resp,
     int* num_attempts,
+    const char* func_name,
     const boost::function<Status(MasterServiceProxy*,
                                  const ReqClass&,
                                  RespClass*,
@@ -143,7 +144,8 @@ Status KuduClient::Data::SyncLeaderMasterRpc(
     // Have we already exceeded our deadline?
     MonoTime now = MonoTime::Now(MonoTime::FINE);
     if (deadline.ComesBefore(now)) {
-      return Status::TimedOut("timed out waiting for a reply from a leader master");
+      return Status::TimedOut(Substitute("$0 timed out after deadline expired",
+                                         func_name));
     }
 
     // The RPC's deadline is intentionally earlier than the overall
@@ -171,15 +173,21 @@ Status KuduClient::Data::SyncLeaderMasterRpc(
       }
     }
 
-    if (s.IsTimedOut() && MonoTime::Now(MonoTime::FINE).ComesBefore(deadline)) {
-      LOG(WARNING) << "Unable to send the request (" << req.ShortDebugString()
-                   << ") to leader Master (" << leader_master_hostport().ToString()
-                   << "): " << s.ToString();
-      if (client->IsMultiMaster()) {
-        LOG(INFO) << "Determining the new leader Master and retrying...";
-        WARN_NOT_OK(SetMasterServerProxy(client, deadline),
-                    "Unable to determine the new leader Master");
-        continue;
+    if (s.IsTimedOut()) {
+      if (MonoTime::Now(MonoTime::FINE).ComesBefore(deadline)) {
+        LOG(WARNING) << "Unable to send the request (" << req.ShortDebugString()
+                     << ") to leader Master (" << leader_master_hostport().ToString()
+                     << "): " << s.ToString();
+        if (client->IsMultiMaster()) {
+          LOG(INFO) << "Determining the new leader Master and retrying...";
+          WARN_NOT_OK(SetMasterServerProxy(client, deadline),
+                      "Unable to determine the new leader Master");
+          continue;
+        }
+      } else {
+        // Operation deadline expired during this latest RPC.
+        s = s.CloneAndPrepend(Substitute("$0 timed out after deadline expired",
+                                         func_name));
       }
     }
 
@@ -206,6 +214,7 @@ Status KuduClient::Data::SyncLeaderMasterRpc(
     const ListTablesRequestPB& req,
     ListTablesResponsePB* resp,
     int* num_attempts,
+    const char* func_name,
     const boost::function<Status(MasterServiceProxy*,
                                  const ListTablesRequestPB&,
                                  ListTablesResponsePB*,
@@ -217,6 +226,7 @@ Status KuduClient::Data::SyncLeaderMasterRpc(
     const ListTabletServersRequestPB& req,
     ListTabletServersResponsePB* resp,
     int* num_attempts,
+    const char* func_name,
     const boost::function<Status(MasterServiceProxy*,
                                  const ListTabletServersRequestPB&,
                                  ListTabletServersResponsePB*,
@@ -330,7 +340,7 @@ Status KuduClient::Data::CreateTable(KuduClient* client,
 
   int attempts = 0;
   Status s = SyncLeaderMasterRpc<CreateTableRequestPB, CreateTableResponsePB>(
-      deadline, client, req, &resp, &attempts, &MasterServiceProxy::CreateTable);
+      deadline, client, req, &resp, &attempts, "CreateTable", &MasterServiceProxy::CreateTable);
   RETURN_NOT_OK(s);
   if (resp.has_error()) {
     if (resp.error().code() == MasterErrorPB::TABLE_ALREADY_PRESENT && attempts > 1) {
@@ -391,6 +401,7 @@ Status KuduClient::Data::IsCreateTableInProgress(KuduClient* client,
           req,
           &resp,
           nullptr,
+          "IsCreateTableDone",
           &MasterServiceProxy::IsCreateTableDone);
   // RETURN_NOT_OK macro can't take templated function call as param,
   // and SyncLeaderMasterRpc must be explicitly instantiated, else the
@@ -424,7 +435,7 @@ Status KuduClient::Data::DeleteTable(KuduClient* client,
   req.mutable_table()->set_table_name(table_name);
   Status s = SyncLeaderMasterRpc<DeleteTableRequestPB, DeleteTableResponsePB>(
       deadline, client, req, &resp,
-      &attempts, &MasterServiceProxy::DeleteTable);
+      &attempts, "DeleteTable", &MasterServiceProxy::DeleteTable);
   RETURN_NOT_OK(s);
   if (resp.has_error()) {
     if (resp.error().code() == MasterErrorPB::TABLE_NOT_FOUND && attempts > 1) {
@@ -449,6 +460,7 @@ Status KuduClient::Data::AlterTable(KuduClient* client,
           req,
           &resp,
           nullptr,
+          "AlterTable",
           &MasterServiceProxy::AlterTable);
   RETURN_NOT_OK(s);
   // TODO: Consider the situation where the request is sent to the
@@ -477,6 +489,7 @@ Status KuduClient::Data::IsAlterTableInProgress(KuduClient* client,
           req,
           &resp,
           nullptr,
+          "IsAlterTableDone",
           &MasterServiceProxy::IsAlterTableDone);
   RETURN_NOT_OK(s);
   if (resp.has_error()) {
@@ -604,8 +617,14 @@ GetTableSchemaRpc::~GetTableSchemaRpc() {
 }
 
 void GetTableSchemaRpc::SendRpc() {
+  MonoTime now = MonoTime::Now(MonoTime::FINE);
+  if (retrier().deadline().ComesBefore(now)) {
+    SendRpcCb(Status::TimedOut("GetTableSchema timed out after deadline expired"));
+    return;
+  }
+
   // See KuduClient::Data::SyncLeaderMasterRpc().
-  MonoTime rpc_deadline = MonoTime::Now(MonoTime::FINE);
+  MonoTime rpc_deadline = now;
   rpc_deadline.AddDelta(client_->default_rpc_timeout());
   mutable_retrier()->mutable_controller()->set_deadline(
       MonoTime::Earliest(rpc_deadline, retrier().deadline()));
@@ -661,14 +680,19 @@ void GetTableSchemaRpc::SendRpcCb(const Status& status) {
     new_status = StatusFromPB(resp_.error().status());
   }
 
-  if (new_status.IsTimedOut() &&
-      MonoTime::Now(MonoTime::FINE).ComesBefore(retrier().deadline())) {
-    if (client_->IsMultiMaster()) {
-      LOG(WARNING) << "Leader Master ("
-                   << client_->data_->leader_master_hostport().ToString()
-                   << ") timed out, re-trying...";
-      ResetLeaderMasterAndRetry();
-      return;
+  if (new_status.IsTimedOut()) {
+    if (MonoTime::Now(MonoTime::FINE).ComesBefore(retrier().deadline())) {
+      if (client_->IsMultiMaster()) {
+        LOG(WARNING) << "Leader Master ("
+            << client_->data_->leader_master_hostport().ToString()
+            << ") timed out, re-trying...";
+        ResetLeaderMasterAndRetry();
+        return;
+      }
+    } else {
+      // Operation deadline expired during this latest RPC.
+      new_status = new_status.CloneAndPrepend(
+          "GetTableSchema timed out after deadline expired");
     }
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/ab2c673d/src/kudu/client/client-internal.h
----------------------------------------------------------------------
diff --git a/src/kudu/client/client-internal.h b/src/kudu/client/client-internal.h
index fa04b9c..9571616 100644
--- a/src/kudu/client/client-internal.h
+++ b/src/kudu/client/client-internal.h
@@ -172,7 +172,8 @@ class KuduClient::Data {
   //
   // NOTE: 'rpc_timeout' is a per-call timeout, while 'deadline' is a
   // per operation deadline. If 'deadline' is not initialized, 'func' is
-  // retried forever.
+  // retried forever. If 'deadline' expires, 'func_name' is included in
+  // the resulting Status.
   template<class ReqClass, class RespClass>
   Status SyncLeaderMasterRpc(
       const MonoTime& deadline,
@@ -180,6 +181,7 @@ class KuduClient::Data {
       const ReqClass& req,
       RespClass* resp,
       int* num_attempts,
+      const char* func_name,
       const boost::function<Status(master::MasterServiceProxy*,
                                    const ReqClass&, RespClass*,
                                    rpc::RpcController*)>& func);

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/ab2c673d/src/kudu/client/client-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/client-test.cc b/src/kudu/client/client-test.cc
index 032000b..fba4e00 100644
--- a/src/kudu/client/client-test.cc
+++ b/src/kudu/client/client-test.cc
@@ -1459,7 +1459,7 @@ TEST_F(ClientTest, TestWriteTimeout) {
     ASSERT_TRUE(error->status().IsTimedOut()) << error->status().ToString();
     ASSERT_STR_CONTAINS(error->status().ToString(),
                         "GetTableLocations(client-testtb, int32 key=1, 1) "
-                        "failed: GetTableLocations RPC");
+                        "failed: timed out after deadline expired");
   }
 
   // Next time out the actual write on the tablet server.

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/ab2c673d/src/kudu/client/client.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/client.cc b/src/kudu/client/client.cc
index 0ae3fca..1fbfb83 100644
--- a/src/kudu/client/client.cc
+++ b/src/kudu/client/client.cc
@@ -292,6 +292,7 @@ Status KuduClient::ListTabletServers(vector<KuduTabletServer*>* tablet_servers)
           req,
           &resp,
           nullptr,
+          "ListTabletServers",
           &MasterServiceProxy::ListTabletServers);
   RETURN_NOT_OK(s);
   if (resp.has_error()) {
@@ -324,6 +325,7 @@ Status KuduClient::ListTables(vector<string>* tables,
           req,
           &resp,
           nullptr,
+          "ListTables",
           &MasterServiceProxy::ListTables);
   RETURN_NOT_OK(s);
   if (resp.has_error()) {

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/ab2c673d/src/kudu/client/meta_cache.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/meta_cache.cc b/src/kudu/client/meta_cache.cc
index 417be2e..61ae145 100644
--- a/src/kudu/client/meta_cache.cc
+++ b/src/kudu/client/meta_cache.cc
@@ -433,7 +433,12 @@ void LookupRpc::SendRpc() {
   // some additional tablets.
 
   // See KuduClient::Data::SyncLeaderMasterRpc().
-  MonoTime rpc_deadline = MonoTime::Now(MonoTime::FINE);
+  MonoTime now = MonoTime::Now(MonoTime::FINE);
+  if (retrier().deadline().ComesBefore(now)) {
+    SendRpcCb(Status::TimedOut("timed out after deadline expired"));
+    return;
+  }
+  MonoTime rpc_deadline = now;
   rpc_deadline.AddDelta(meta_cache_->client_->default_rpc_timeout());
   mutable_retrier()->mutable_controller()->set_deadline(
       MonoTime::Earliest(rpc_deadline, retrier().deadline()));
@@ -493,13 +498,18 @@ void LookupRpc::SendRpcCb(const Status& status) {
     new_status = StatusFromPB(resp_.error().status());
   }
 
-  if (new_status.IsTimedOut() &&
-      MonoTime::Now(MonoTime::FINE).ComesBefore(retrier().deadline())) {
-    if (meta_cache_->client_->IsMultiMaster()) {
-      LOG(WARNING) << "Leader Master timed out, re-trying...";
-      ResetMasterLeaderAndRetry();
-      ignore_result(delete_me.release());
-      return;
+  if (new_status.IsTimedOut()) {
+    if (MonoTime::Now(MonoTime::FINE).ComesBefore(retrier().deadline())) {
+      if (meta_cache_->client_->IsMultiMaster()) {
+        LOG(WARNING) << "Leader Master timed out, re-trying...";
+        ResetMasterLeaderAndRetry();
+        ignore_result(delete_me.release());
+        return;
+      }
+    } else {
+      // Operation deadline expired during this latest RPC.
+      new_status = new_status.CloneAndPrepend(
+          "timed out after deadline expired");
     }
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/ab2c673d/src/kudu/client/scanner-internal.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/scanner-internal.cc b/src/kudu/client/scanner-internal.cc
index 3a2ec86..a25a486 100644
--- a/src/kudu/client/scanner-internal.cc
+++ b/src/kudu/client/scanner-internal.cc
@@ -316,6 +316,13 @@ Status KuduScanner::Data::OpenTablet(const string& partition_key,
     }
     RETURN_NOT_OK(lookup_status);
 
+    MonoTime now = MonoTime::Now(MonoTime::FINE);
+    if (deadline.ComesBefore(now)) {
+      Status ret = Status::TimedOut("Scan timed out, deadline expired");
+      return last_error_.ok() ?
+          ret : ret.CloneAndAppend(last_error_.ToString());
+    }
+
     // Recalculate the deadlines.
     // If we have other replicas beyond this one to try, then we'll try to
     // open the scanner with the default RPC timeout. That gives us time to
@@ -323,7 +330,7 @@ Status KuduScanner::Data::OpenTablet(const string& partition_key,
     // full remaining deadline for the user's call.
     MonoTime rpc_deadline;
     if (static_cast<int>(candidates.size()) - blacklist->size() > 1) {
-      rpc_deadline = MonoTime::Now(MonoTime::FINE);
+      rpc_deadline = now;
       rpc_deadline.AddDelta(table_->client()->default_rpc_timeout());
       rpc_deadline = MonoTime::Earliest(deadline, rpc_deadline);
     } else {

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/ab2c673d/src/kudu/client/table-internal.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/table-internal.cc b/src/kudu/client/table-internal.cc
index e603f49..ca97480 100644
--- a/src/kudu/client/table-internal.cc
+++ b/src/kudu/client/table-internal.cc
@@ -73,8 +73,7 @@ Status KuduTable::Data::Open() {
     // Have we already exceeded our deadline?
     MonoTime now = MonoTime::Now(MonoTime::FINE);
     if (deadline.ComesBefore(now)) {
-      const char* msg = "Timed out waiting for non-empty GetTableLocations "
-          "reply from a leader Master";
+      const char* msg = "OpenTable timed out after deadline expired";
       LOG(ERROR) << msg;
       return Status::TimedOut(msg);
     }
@@ -104,6 +103,8 @@ Status KuduTable::Data::Open() {
 
       if (s.IsTimedOut()
           && MonoTime::Now(MonoTime::FINE).ComesBefore(deadline)) {
+        // If the RPC timed out and the operation deadline expired, we'll loop
+        // again and time out for good above.
         LOG(WARNING) << "Timed out talking to the leader master ("
                      << client_->data_->leader_master_hostport().ToString() << "): "
                      << s.ToString();