You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by wd...@apache.org on 2018/05/24 22:11:32 UTC
[2/2] kudu git commit: KUDU-2426 Fix WRONG_SERVER_UUID case in ksck
KUDU-2426 Fix WRONG_SERVER_UUID case in ksck
RemoteKsckTabletServer::FetchInfo() returned a Status::RemoteError since
KUDU-2364 to indicate a UUID mismatch.
Ksck::FetchInfoFromTabletServers() checked for this Status, and this
resulted in any RemoteError showing as a WRONG_SERVER_UUID in the
tablet server health list.
This commit adds KsckServerHealth output argument to FetchInfo() and
FetchConsensusState() which can be used to add further context to a
non-OK Status.
Change-Id: I2b4f50fe4dd94450b4f2e34dbad315bd761b071f
Reviewed-on: http://gerrit.cloudera.org:8080/10293
Tested-by: Kudu Jenkins
Reviewed-by: Andrew Wong <aw...@cloudera.com>
Reviewed-by: Alexey Serbin <as...@cloudera.com>
Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/2f61b72c
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/2f61b72c
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/2f61b72c
Branch: refs/heads/master
Commit: 2f61b72ca470eab9ee73f7c445582b50a9435ca7
Parents: 70fcb5e
Author: Attila Bukor <ab...@cloudera.com>
Authored: Thu May 24 11:36:49 2018 +0200
Committer: Will Berkeley <wd...@gmail.com>
Committed: Thu May 24 22:10:08 2018 +0000
----------------------------------------------------------------------
src/kudu/tools/ksck-test.cc | 16 ++++++++++++----
src/kudu/tools/ksck.cc | 13 ++++---------
src/kudu/tools/ksck.h | 17 +++++++++++++----
src/kudu/tools/ksck_remote-test.cc | 19 ++++++++++++++-----
src/kudu/tools/ksck_remote.cc | 17 +++++++++++++----
src/kudu/tools/ksck_remote.h | 6 ++++--
src/kudu/tools/ksck_results.cc | 3 ++-
7 files changed, 62 insertions(+), 29 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck-test.cc b/src/kudu/tools/ksck-test.cc
index fd83ab5..eff2609 100644
--- a/src/kudu/tools/ksck-test.cc
+++ b/src/kudu/tools/ksck-test.cc
@@ -101,10 +101,13 @@ class MockKsckTabletServer : public KsckTabletServer {
explicit MockKsckTabletServer(const string& uuid)
: KsckTabletServer(uuid),
fetch_info_status_(Status::OK()),
+ fetch_info_health_(KsckServerHealth::HEALTHY),
address_("<mock>") {
}
- Status FetchInfo() override {
+ Status FetchInfo(KsckServerHealth* health) override {
+ CHECK(health);
+ *health = fetch_info_health_;
timestamp_ = 12345;
if (fetch_info_status_.ok()) {
state_ = KsckFetchState::FETCHED;
@@ -114,7 +117,7 @@ class MockKsckTabletServer : public KsckTabletServer {
return fetch_info_status_;
}
- Status FetchConsensusState() override {
+ Status FetchConsensusState(KsckServerHealth* /*health*/) override {
return Status::OK();
}
@@ -131,8 +134,9 @@ class MockKsckTabletServer : public KsckTabletServer {
return address_;
}
- // Public because the unit tests mutate this variable directly.
+ // Public because the unit tests mutate these variables directly.
Status fetch_info_status_;
+ KsckServerHealth fetch_info_health_;
private:
const string address_;
@@ -828,6 +832,8 @@ TEST_F(KsckTest, TestWrongUUIDTabletServer) {
"doesn't match the expected ID");
static_pointer_cast<MockKsckTabletServer>(cluster_->tablet_servers_["ts-id-1"])
->fetch_info_status_ = error;
+ static_pointer_cast<MockKsckTabletServer>(cluster_->tablet_servers_["ts-id-1"])
+ ->fetch_info_health_ = KsckServerHealth::WRONG_SERVER_UUID;
ASSERT_OK(ksck_->CheckClusterRunning());
ASSERT_OK(ksck_->FetchTableAndTabletInfo());
@@ -851,6 +857,8 @@ TEST_F(KsckTest, TestBadTabletServer) {
Status error = Status::NetworkError("Network failure");
static_pointer_cast<MockKsckTabletServer>(cluster_->tablet_servers_["ts-id-1"])
->fetch_info_status_ = error;
+ static_pointer_cast<MockKsckTabletServer>(cluster_->tablet_servers_["ts-id-1"])
+ ->fetch_info_health_ = KsckServerHealth::UNAVAILABLE;
ASSERT_OK(ksck_->CheckClusterRunning());
ASSERT_OK(ksck_->FetchTableAndTabletInfo());
@@ -870,7 +878,7 @@ TEST_F(KsckTest, TestBadTabletServer) {
" ts-id-1 | <mock> | UNAVAILABLE\n");
ASSERT_STR_CONTAINS(
err_stream_.str(),
- "Error from <mock>: Network error: Network failure\n");
+ "Error from <mock>: Network error: Network failure (UNAVAILABLE)\n");
ASSERT_STR_CONTAINS(
err_stream_.str(),
"Tablet tablet-id-0 of table 'test' is under-replicated: 1 replica(s) not RUNNING\n"
http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck.cc b/src/kudu/tools/ksck.cc
index 48a0814..5e36f79 100644
--- a/src/kudu/tools/ksck.cc
+++ b/src/kudu/tools/ksck.cc
@@ -292,9 +292,10 @@ Status Ksck::FetchInfoFromTabletServers() {
const auto& ts = entry.second;
CHECK_OK(pool->SubmitFunc([&]() {
VLOG(1) << "Going to connect to tablet server: " << ts->uuid();
- Status s = ts->FetchInfo().AndThen([&ts]() {
+ KsckServerHealth health;
+ Status s = ts->FetchInfo(&health).AndThen([&ts, &health]() {
if (FLAGS_consensus) {
- return ts->FetchConsensusState();
+ return ts->FetchConsensusState(&health);
}
return Status::OK();
});
@@ -304,14 +305,8 @@ Status Ksck::FetchInfoFromTabletServers() {
summary.status = s;
if (!s.ok()) {
bad_servers.Increment();
- if (s.IsRemoteError()) {
- summary.health = KsckServerHealth::WRONG_SERVER_UUID;
- } else {
- summary.health = KsckServerHealth::UNAVAILABLE;
- }
- } else {
- summary.health = KsckServerHealth::HEALTHY;
}
+ summary.health = health;
std::lock_guard<simple_spinlock> lock(tablet_server_summaries_lock);
tablet_server_summaries.push_back(std::move(summary));
http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck.h
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck.h b/src/kudu/tools/ksck.h
index 77b3d3c..c2bed92 100644
--- a/src/kudu/tools/ksck.h
+++ b/src/kudu/tools/ksck.h
@@ -281,11 +281,20 @@ class KsckTabletServer {
explicit KsckTabletServer(std::string uuid) : uuid_(std::move(uuid)) {}
virtual ~KsckTabletServer() { }
- // Connects to the configured tablet server and populates the fields of this class.
- virtual Status FetchInfo() = 0;
+ // Connects to the configured tablet server and populates the fields of this class. 'health' must
+ // not be nullptr.
+ //
+ // If Status is OK, 'health' will be HEALTHY
+ // If the UUID is not what ksck expects, 'health' will be WRONG_SERVER_UUID
+ // Otherwise 'health' will be UNAVAILABLE
+ virtual Status FetchInfo(KsckServerHealth* health) = 0;
- // Connects to the configured tablet server and populates the consensus map.
- virtual Status FetchConsensusState() = 0;
+ // Connects to the configured tablet server and populates the consensus map. 'health' must not be
+ // nullptr.
+ //
+ // If Status is OK, 'health' will be HEALTHY
+ // Otherwise 'health' will be UNAVAILABLE
+ virtual Status FetchConsensusState(KsckServerHealth* health) = 0;
// Executes a checksum scan on the associated tablet, and runs the callback
// with the result. The callback must be threadsafe and non-blocking.
http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck_remote-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote-test.cc b/src/kudu/tools/ksck_remote-test.cc
index 59ff953..b807759 100644
--- a/src/kudu/tools/ksck_remote-test.cc
+++ b/src/kudu/tools/ksck_remote-test.cc
@@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
+#include "kudu/tools/ksck_remote.h"
+
#include <cstdint>
#include <memory>
#include <sstream>
@@ -27,8 +29,8 @@
#include <gtest/gtest.h>
#include "kudu/client/client.h"
-#include "kudu/client/shared_ptr.h"
#include "kudu/client/schema.h"
+#include "kudu/client/shared_ptr.h"
#include "kudu/client/write_op.h"
#include "kudu/common/partial_row.h"
#include "kudu/gutil/gscoped_ptr.h"
@@ -39,12 +41,12 @@
#include "kudu/mini-cluster/internal_mini_cluster.h"
#include "kudu/tools/data_gen_util.h"
#include "kudu/tools/ksck.h"
-#include "kudu/tools/ksck_remote.h"
#include "kudu/tserver/mini_tablet_server.h"
#include "kudu/util/atomic.h"
#include "kudu/util/countdown_latch.h"
#include "kudu/util/monotime.h"
#include "kudu/util/net/net_util.h"
+#include "kudu/util/net/sockaddr.h"
#include "kudu/util/promise.h"
#include "kudu/util/random.h"
#include "kudu/util/status.h"
@@ -238,9 +240,16 @@ TEST_F(RemoteKsckTest, TestTabletServerMismatchedUUID) {
ASSERT_TRUE(ksck_->FetchInfoFromTabletServers().IsNetworkError());
ASSERT_OK(ksck_->PrintResults());
- string match_string = "Remote error: ID reported by tablet server "
- "($0) doesn't match the expected ID: $1";
- ASSERT_STR_CONTAINS(err_stream_.str(), strings::Substitute(match_string, new_uuid, old_uuid));
+ string match_string = "Error from $0: Remote error: ID reported by tablet server "
+ "($1) doesn't match the expected ID: $2 (WRONG_SERVER_UUID)";
+ ASSERT_STR_CONTAINS(err_stream_.str(), strings::Substitute(match_string, address.ToString(),
+ new_uuid, old_uuid));
+ tserver::MiniTabletServer* ts = mini_cluster_->mini_tablet_server(1);
+ ASSERT_STR_CONTAINS(err_stream_.str(), strings::Substitute("$0 | $1 | HEALTHY", ts->uuid(),
+ ts->bound_rpc_addr().ToString()));
+ ts = mini_cluster_->mini_tablet_server(2);
+ ASSERT_STR_CONTAINS(err_stream_.str(), strings::Substitute("$0 | $1 | HEALTHY", ts->uuid(),
+ ts->bound_rpc_addr().ToString()));
}
TEST_F(RemoteKsckTest, TestTableConsistency) {
http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck_remote.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote.cc b/src/kudu/tools/ksck_remote.cc
index 017321e..3c92eca 100644
--- a/src/kudu/tools/ksck_remote.cc
+++ b/src/kudu/tools/ksck_remote.cc
@@ -18,6 +18,7 @@
#include "kudu/tools/ksck_remote.h"
#include <cstdint>
+#include <map>
#include <ostream>
#include <unordered_map>
@@ -28,8 +29,8 @@
#include <glog/logging.h>
#include "kudu/client/client.h"
-#include "kudu/client/schema.h"
#include "kudu/client/replica_controller-internal.h"
+#include "kudu/client/schema.h"
#include "kudu/common/common.pb.h"
#include "kudu/common/schema.h"
#include "kudu/common/wire_protocol.h"
@@ -50,6 +51,8 @@
#include "kudu/server/server_base.pb.h"
#include "kudu/server/server_base.proxy.h"
#include "kudu/tablet/tablet.pb.h"
+#include "kudu/tools/ksck.h"
+#include "kudu/tools/ksck_results.h"
#include "kudu/tserver/tablet_server.h"
#include "kudu/tserver/tserver.pb.h"
#include "kudu/tserver/tserver_service.pb.h"
@@ -150,9 +153,10 @@ Status RemoteKsckTabletServer::Init() {
return Status::OK();
}
-Status RemoteKsckTabletServer::FetchInfo() {
+Status RemoteKsckTabletServer::FetchInfo(KsckServerHealth* health) {
+ DCHECK(health);
state_ = KsckFetchState::FETCH_FAILED;
-
+ *health = KsckServerHealth::UNAVAILABLE;
{
server::GetStatusRequestPB req;
server::GetStatusResponsePB resp;
@@ -162,6 +166,7 @@ Status RemoteKsckTabletServer::FetchInfo() {
"could not get status from server");
string response_uuid = resp.status().node_instance().permanent_uuid();
if (response_uuid != uuid()) {
+ *health = KsckServerHealth::WRONG_SERVER_UUID;
return Status::RemoteError(Substitute("ID reported by tablet server ($0) doesn't "
"match the expected ID: $1",
response_uuid, uuid()));
@@ -197,10 +202,13 @@ Status RemoteKsckTabletServer::FetchInfo() {
}
state_ = KsckFetchState::FETCHED;
+ *health = KsckServerHealth::HEALTHY;
return Status::OK();
}
-Status RemoteKsckTabletServer::FetchConsensusState() {
+Status RemoteKsckTabletServer::FetchConsensusState(KsckServerHealth* health) {
+ DCHECK(health);
+ *health = KsckServerHealth::UNAVAILABLE;
tablet_consensus_state_map_.clear();
consensus::GetConsensusStateRequestPB req;
consensus::GetConsensusStateResponsePB resp;
@@ -219,6 +227,7 @@ Status RemoteKsckTabletServer::FetchConsensusState() {
}
}
+ *health = KsckServerHealth::HEALTHY;
return Status::OK();
}
http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck_remote.h
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote.h b/src/kudu/tools/ksck_remote.h
index a718a8f..0ceda93 100644
--- a/src/kudu/tools/ksck_remote.h
+++ b/src/kudu/tools/ksck_remote.h
@@ -54,6 +54,8 @@ class TabletServerServiceProxy;
namespace tools {
+enum class KsckServerHealth;
+
// This implementation connects to a master via RPC.
class RemoteKsckMaster : public KsckMaster {
public:
@@ -93,9 +95,9 @@ class RemoteKsckTabletServer : public KsckTabletServer {
// Must be called after constructing.
Status Init();
- Status FetchInfo() override;
+ Status FetchInfo(KsckServerHealth* health) override;
- Status FetchConsensusState() override;
+ Status FetchConsensusState(KsckServerHealth* health) override;
void RunTabletChecksumScanAsync(
const std::string& tablet_id,
http://git-wip-us.apache.org/repos/asf/kudu/blob/2f61b72c/src/kudu/tools/ksck_results.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_results.cc b/src/kudu/tools/ksck_results.cc
index adcbfd7..a3b01af 100644
--- a/src/kudu/tools/ksck_results.cc
+++ b/src/kudu/tools/ksck_results.cc
@@ -295,7 +295,8 @@ Status PrintServerHealthSummaries(KsckServerType type,
// This isn't done as part of the table because the messages can be quite long.
for (const auto& s : summaries) {
if (s.health == KsckServerHealth::HEALTHY) continue;
- out << Substitute("Error from $1: $2", s.uuid, s.address, s.status.ToString()) << endl;
+ out << Substitute("Error from $0: $1 ($2)", s.address, s.status.ToString(),
+ ServerHealthToString(s.health)) << endl;
}
return Status::OK();
}