You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by wd...@apache.org on 2018/05/24 22:11:32 UTC

[2/2] kudu git commit: KUDU-2426 Fix WRONG_SERVER_UUID case in ksck

KUDU-2426 Fix WRONG_SERVER_UUID case in ksck

RemoteKsckTabletServer::FetchInfo() returned a Status::RemoteError since
KUDU-2364 to indicate a UUID mismatch.
Ksck::FetchInfoFromTabletServers() checked for this Status, and this
resulted in any RemoteError showing as a WRONG_SERVER_UUID in the
tablet server health list.

This commit adds KsckServerHealth output argument to FetchInfo() and
FetchConsensusState() which can be used to add further context to a
non-OK Status.

Change-Id: I2b4f50fe4dd94450b4f2e34dbad315bd761b071f
Reviewed-on: http://gerrit.cloudera.org:8080/10293
Tested-by: Kudu Jenkins
Reviewed-by: Andrew Wong <aw...@cloudera.com>
Reviewed-by: Alexey Serbin <as...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/2f61b72c
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/2f61b72c
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/2f61b72c

Branch: refs/heads/master
Commit: 2f61b72ca470eab9ee73f7c445582b50a9435ca7
Parents: 70fcb5e
Author: Attila Bukor <ab...@cloudera.com>
Authored: Thu May 24 11:36:49 2018 +0200
Committer: Will Berkeley <wd...@gmail.com>
Committed: Thu May 24 22:10:08 2018 +0000

----------------------------------------------------------------------
 src/kudu/tools/ksck-test.cc        | 16 ++++++++++++----
 src/kudu/tools/ksck.cc             | 13 ++++---------
 src/kudu/tools/ksck.h              | 17 +++++++++++++----
 src/kudu/tools/ksck_remote-test.cc | 19 ++++++++++++++-----
 src/kudu/tools/ksck_remote.cc      | 17 +++++++++++++----
 src/kudu/tools/ksck_remote.h       |  6 ++++--
 src/kudu/tools/ksck_results.cc     |  3 ++-
 7 files changed, 62 insertions(+), 29 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck-test.cc b/src/kudu/tools/ksck-test.cc
index fd83ab5..eff2609 100644
--- a/src/kudu/tools/ksck-test.cc
+++ b/src/kudu/tools/ksck-test.cc
@@ -101,10 +101,13 @@ class MockKsckTabletServer : public KsckTabletServer {
   explicit MockKsckTabletServer(const string& uuid)
       : KsckTabletServer(uuid),
         fetch_info_status_(Status::OK()),
+        fetch_info_health_(KsckServerHealth::HEALTHY),
         address_("<mock>") {
   }
 
-  Status FetchInfo() override {
+  Status FetchInfo(KsckServerHealth* health) override {
+    CHECK(health);
+    *health = fetch_info_health_;
     timestamp_ = 12345;
     if (fetch_info_status_.ok()) {
       state_ = KsckFetchState::FETCHED;
@@ -114,7 +117,7 @@ class MockKsckTabletServer : public KsckTabletServer {
     return fetch_info_status_;
   }
 
-  Status FetchConsensusState() override {
+  Status FetchConsensusState(KsckServerHealth* /*health*/) override {
     return Status::OK();
   }
 
@@ -131,8 +134,9 @@ class MockKsckTabletServer : public KsckTabletServer {
     return address_;
   }
 
-  // Public because the unit tests mutate this variable directly.
+  // Public because the unit tests mutate these variables directly.
   Status fetch_info_status_;
+  KsckServerHealth fetch_info_health_;
 
  private:
   const string address_;
@@ -828,6 +832,8 @@ TEST_F(KsckTest, TestWrongUUIDTabletServer) {
                                      "doesn't match the expected ID");
   static_pointer_cast<MockKsckTabletServer>(cluster_->tablet_servers_["ts-id-1"])
     ->fetch_info_status_ = error;
+  static_pointer_cast<MockKsckTabletServer>(cluster_->tablet_servers_["ts-id-1"])
+    ->fetch_info_health_ = KsckServerHealth::WRONG_SERVER_UUID;
 
   ASSERT_OK(ksck_->CheckClusterRunning());
   ASSERT_OK(ksck_->FetchTableAndTabletInfo());
@@ -851,6 +857,8 @@ TEST_F(KsckTest, TestBadTabletServer) {
   Status error = Status::NetworkError("Network failure");
   static_pointer_cast<MockKsckTabletServer>(cluster_->tablet_servers_["ts-id-1"])
       ->fetch_info_status_ = error;
+  static_pointer_cast<MockKsckTabletServer>(cluster_->tablet_servers_["ts-id-1"])
+      ->fetch_info_health_ = KsckServerHealth::UNAVAILABLE;
 
   ASSERT_OK(ksck_->CheckClusterRunning());
   ASSERT_OK(ksck_->FetchTableAndTabletInfo());
@@ -870,7 +878,7 @@ TEST_F(KsckTest, TestBadTabletServer) {
       " ts-id-1 | <mock>  | UNAVAILABLE\n");
   ASSERT_STR_CONTAINS(
       err_stream_.str(),
-      "Error from <mock>: Network error: Network failure\n");
+      "Error from <mock>: Network error: Network failure (UNAVAILABLE)\n");
   ASSERT_STR_CONTAINS(
       err_stream_.str(),
       "Tablet tablet-id-0 of table 'test' is under-replicated: 1 replica(s) not RUNNING\n"

http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck.cc b/src/kudu/tools/ksck.cc
index 48a0814..5e36f79 100644
--- a/src/kudu/tools/ksck.cc
+++ b/src/kudu/tools/ksck.cc
@@ -292,9 +292,10 @@ Status Ksck::FetchInfoFromTabletServers() {
     const auto& ts = entry.second;
     CHECK_OK(pool->SubmitFunc([&]() {
           VLOG(1) << "Going to connect to tablet server: " << ts->uuid();
-          Status s = ts->FetchInfo().AndThen([&ts]() {
+          KsckServerHealth health;
+          Status s = ts->FetchInfo(&health).AndThen([&ts, &health]() {
                 if (FLAGS_consensus) {
-                  return ts->FetchConsensusState();
+                  return ts->FetchConsensusState(&health);
                 }
                 return Status::OK();
               });
@@ -304,14 +305,8 @@ Status Ksck::FetchInfoFromTabletServers() {
           summary.status = s;
           if (!s.ok()) {
             bad_servers.Increment();
-            if (s.IsRemoteError()) {
-              summary.health = KsckServerHealth::WRONG_SERVER_UUID;
-            } else {
-              summary.health = KsckServerHealth::UNAVAILABLE;
-              }
-          } else {
-            summary.health = KsckServerHealth::HEALTHY;
           }
+          summary.health = health;
 
           std::lock_guard<simple_spinlock> lock(tablet_server_summaries_lock);
           tablet_server_summaries.push_back(std::move(summary));

http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck.h
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck.h b/src/kudu/tools/ksck.h
index 77b3d3c..c2bed92 100644
--- a/src/kudu/tools/ksck.h
+++ b/src/kudu/tools/ksck.h
@@ -281,11 +281,20 @@ class KsckTabletServer {
   explicit KsckTabletServer(std::string uuid) : uuid_(std::move(uuid)) {}
   virtual ~KsckTabletServer() { }
 
-  // Connects to the configured tablet server and populates the fields of this class.
-  virtual Status FetchInfo() = 0;
+  // Connects to the configured tablet server and populates the fields of this class. 'health' must
+  // not be nullptr.
+  //
+  // If Status is OK, 'health' will be HEALTHY
+  // If the UUID is not what ksck expects, 'health' will be WRONG_SERVER_UUID
+  // Otherwise 'health' will be UNAVAILABLE
+  virtual Status FetchInfo(KsckServerHealth* health) = 0;
 
-  // Connects to the configured tablet server and populates the consensus map.
-  virtual Status FetchConsensusState() = 0;
+  // Connects to the configured tablet server and populates the consensus map. 'health' must not be
+  // nullptr.
+  //
+  // If Status is OK, 'health' will be HEALTHY
+  // Otherwise 'health' will be UNAVAILABLE
+  virtual Status FetchConsensusState(KsckServerHealth* health) = 0;
 
   // Executes a checksum scan on the associated tablet, and runs the callback
   // with the result. The callback must be threadsafe and non-blocking.

http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck_remote-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote-test.cc b/src/kudu/tools/ksck_remote-test.cc
index 59ff953..b807759 100644
--- a/src/kudu/tools/ksck_remote-test.cc
+++ b/src/kudu/tools/ksck_remote-test.cc
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include "kudu/tools/ksck_remote.h"
+
 #include <cstdint>
 #include <memory>
 #include <sstream>
@@ -27,8 +29,8 @@
 #include <gtest/gtest.h>
 
 #include "kudu/client/client.h"
-#include "kudu/client/shared_ptr.h"
 #include "kudu/client/schema.h"
+#include "kudu/client/shared_ptr.h"
 #include "kudu/client/write_op.h"
 #include "kudu/common/partial_row.h"
 #include "kudu/gutil/gscoped_ptr.h"
@@ -39,12 +41,12 @@
 #include "kudu/mini-cluster/internal_mini_cluster.h"
 #include "kudu/tools/data_gen_util.h"
 #include "kudu/tools/ksck.h"
-#include "kudu/tools/ksck_remote.h"
 #include "kudu/tserver/mini_tablet_server.h"
 #include "kudu/util/atomic.h"
 #include "kudu/util/countdown_latch.h"
 #include "kudu/util/monotime.h"
 #include "kudu/util/net/net_util.h"
+#include "kudu/util/net/sockaddr.h"
 #include "kudu/util/promise.h"
 #include "kudu/util/random.h"
 #include "kudu/util/status.h"
@@ -238,9 +240,16 @@ TEST_F(RemoteKsckTest, TestTabletServerMismatchedUUID) {
   ASSERT_TRUE(ksck_->FetchInfoFromTabletServers().IsNetworkError());
   ASSERT_OK(ksck_->PrintResults());
 
-  string match_string = "Remote error: ID reported by tablet server "
-                        "($0) doesn't match the expected ID: $1";
-  ASSERT_STR_CONTAINS(err_stream_.str(), strings::Substitute(match_string, new_uuid, old_uuid));
+  string match_string = "Error from $0: Remote error: ID reported by tablet server "
+                        "($1) doesn't match the expected ID: $2 (WRONG_SERVER_UUID)";
+  ASSERT_STR_CONTAINS(err_stream_.str(), strings::Substitute(match_string, address.ToString(),
+                                                             new_uuid, old_uuid));
+  tserver::MiniTabletServer* ts = mini_cluster_->mini_tablet_server(1);
+  ASSERT_STR_CONTAINS(err_stream_.str(), strings::Substitute("$0 | $1 | HEALTHY", ts->uuid(),
+                                                             ts->bound_rpc_addr().ToString()));
+  ts = mini_cluster_->mini_tablet_server(2);
+  ASSERT_STR_CONTAINS(err_stream_.str(), strings::Substitute("$0 | $1 | HEALTHY", ts->uuid(),
+                                                             ts->bound_rpc_addr().ToString()));
 }
 
 TEST_F(RemoteKsckTest, TestTableConsistency) {

http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck_remote.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote.cc b/src/kudu/tools/ksck_remote.cc
index 017321e..3c92eca 100644
--- a/src/kudu/tools/ksck_remote.cc
+++ b/src/kudu/tools/ksck_remote.cc
@@ -18,6 +18,7 @@
 #include "kudu/tools/ksck_remote.h"
 
 #include <cstdint>
+#include <map>
 #include <ostream>
 #include <unordered_map>
 
@@ -28,8 +29,8 @@
 #include <glog/logging.h>
 
 #include "kudu/client/client.h"
-#include "kudu/client/schema.h"
 #include "kudu/client/replica_controller-internal.h"
+#include "kudu/client/schema.h"
 #include "kudu/common/common.pb.h"
 #include "kudu/common/schema.h"
 #include "kudu/common/wire_protocol.h"
@@ -50,6 +51,8 @@
 #include "kudu/server/server_base.pb.h"
 #include "kudu/server/server_base.proxy.h"
 #include "kudu/tablet/tablet.pb.h"
+#include "kudu/tools/ksck.h"
+#include "kudu/tools/ksck_results.h"
 #include "kudu/tserver/tablet_server.h"
 #include "kudu/tserver/tserver.pb.h"
 #include "kudu/tserver/tserver_service.pb.h"
@@ -150,9 +153,10 @@ Status RemoteKsckTabletServer::Init() {
   return Status::OK();
 }
 
-Status RemoteKsckTabletServer::FetchInfo() {
+Status RemoteKsckTabletServer::FetchInfo(KsckServerHealth* health) {
+  DCHECK(health);
   state_ = KsckFetchState::FETCH_FAILED;
-
+  *health = KsckServerHealth::UNAVAILABLE;
   {
     server::GetStatusRequestPB req;
     server::GetStatusResponsePB resp;
@@ -162,6 +166,7 @@ Status RemoteKsckTabletServer::FetchInfo() {
                           "could not get status from server");
     string response_uuid = resp.status().node_instance().permanent_uuid();
     if (response_uuid != uuid()) {
+      *health = KsckServerHealth::WRONG_SERVER_UUID;
       return Status::RemoteError(Substitute("ID reported by tablet server ($0) doesn't "
                                  "match the expected ID: $1",
                                  response_uuid, uuid()));
@@ -197,10 +202,13 @@ Status RemoteKsckTabletServer::FetchInfo() {
   }
 
   state_ = KsckFetchState::FETCHED;
+  *health = KsckServerHealth::HEALTHY;
   return Status::OK();
 }
 
-Status RemoteKsckTabletServer::FetchConsensusState() {
+Status RemoteKsckTabletServer::FetchConsensusState(KsckServerHealth* health) {
+  DCHECK(health);
+  *health = KsckServerHealth::UNAVAILABLE;
   tablet_consensus_state_map_.clear();
   consensus::GetConsensusStateRequestPB req;
   consensus::GetConsensusStateResponsePB resp;
@@ -219,6 +227,7 @@ Status RemoteKsckTabletServer::FetchConsensusState() {
     }
   }
 
+  *health = KsckServerHealth::HEALTHY;
   return Status::OK();
 }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck_remote.h
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote.h b/src/kudu/tools/ksck_remote.h
index a718a8f..0ceda93 100644
--- a/src/kudu/tools/ksck_remote.h
+++ b/src/kudu/tools/ksck_remote.h
@@ -54,6 +54,8 @@ class TabletServerServiceProxy;
 
 namespace tools {
 
+enum class KsckServerHealth;
+
 // This implementation connects to a master via RPC.
 class RemoteKsckMaster : public KsckMaster {
  public:
@@ -93,9 +95,9 @@ class RemoteKsckTabletServer : public KsckTabletServer {
   // Must be called after constructing.
   Status Init();
 
-  Status FetchInfo() override;
+  Status FetchInfo(KsckServerHealth* health) override;
 
-  Status FetchConsensusState() override;
+  Status FetchConsensusState(KsckServerHealth* health) override;
 
   void RunTabletChecksumScanAsync(
       const std::string& tablet_id,

http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck_results.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_results.cc b/src/kudu/tools/ksck_results.cc
index adcbfd7..a3b01af 100644
--- a/src/kudu/tools/ksck_results.cc
+++ b/src/kudu/tools/ksck_results.cc
@@ -295,7 +295,8 @@ Status PrintServerHealthSummaries(KsckServerType type,
   // This isn't done as part of the table because the messages can be quite long.
   for (const auto& s : summaries) {
     if (s.health == KsckServerHealth::HEALTHY) continue;
-    out << Substitute("Error from $1: $2", s.uuid, s.address, s.status.ToString()) << endl;
+    out << Substitute("Error from $0: $1 ($2)", s.address, s.status.ToString(),
+                      ServerHealthToString(s.health)) << endl;
   }
   return Status::OK();
 }